From 2718ff530ca9a04023ea147424874e686762a384 Mon Sep 17 00:00:00 2001
From: George Othon <60243072+georgeothon@users.noreply.github.com>
Date: Thu, 15 Jun 2023 18:06:52 -0300
Subject: [PATCH 001/136] [doc] Variable 'label' is not defined in the pyspark
 application example (#9302)

---
 doc/tutorials/spark_estimator.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/tutorials/spark_estimator.rst b/doc/tutorials/spark_estimator.rst
index fb69b70e1..545403a34 100644
--- a/doc/tutorials/spark_estimator.rst
+++ b/doc/tutorials/spark_estimator.rst
@@ -146,7 +146,7 @@ using a list of feature names and the additional parameter ``use_gpu``:
     label_name = "class"
 
     # get a list with feature column names
-    feature_names = [x.name for x in train_df.schema if x.name != label]
+    feature_names = [x.name for x in train_df.schema if x.name != label_name]
 
     # create a xgboost pyspark regressor estimator and set use_gpu=True
     regressor = SparkXGBRegressor(

From d8beb517edc8acf1622c13c3086cc8fa5e15a663 Mon Sep 17 00:00:00 2001
From: Rong Ou
Date: Fri, 16 Jun 2023 10:56:50 -0700
Subject: [PATCH 002/136] Support bitwise allreduce in NCCL communicator
 (#9300)

---
 src/collective/nccl_device_communicator.cu   | 228 ++++++++++++++++++
 src/collective/nccl_device_communicator.cuh  | 171 +------------
 .../test_nccl_device_communicator.cu         |  67 ++++-
 3 files changed, 302 insertions(+), 164 deletions(-)
 create mode 100644 src/collective/nccl_device_communicator.cu

diff --git a/src/collective/nccl_device_communicator.cu b/src/collective/nccl_device_communicator.cu
new file mode 100644
index 000000000..6599d4b5a
--- /dev/null
+++ b/src/collective/nccl_device_communicator.cu
@@ -0,0 +1,228 @@
+/*!
+ * Copyright 2023 XGBoost contributors
+ */
+#if defined(XGBOOST_USE_NCCL)
+#include "nccl_device_communicator.cuh"
+
+namespace xgboost {
+namespace collective {
+
+NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, Communicator *communicator)
+    : device_ordinal_{device_ordinal}, communicator_{communicator} {
+  if (device_ordinal_ < 0) {
+    LOG(FATAL) << "Invalid device ordinal: " << device_ordinal_;
+  }
+  if (communicator_ == nullptr) {
+    LOG(FATAL) << "Communicator cannot be null.";
+  }
+
+  int32_t const rank = communicator_->GetRank();
+  int32_t const world = communicator_->GetWorldSize();
+
+  if (world == 1) {
+    return;
+  }
+
+  std::vector<uint64_t> uuids(world * kUuidLength, 0);
+  auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
+  auto s_this_uuid = s_uuid.subspan(rank * kUuidLength, kUuidLength);
+  GetCudaUUID(s_this_uuid);
+
+  // TODO(rongou): replace this with allgather.
+  communicator_->AllReduce(uuids.data(), uuids.size(), DataType::kUInt64, Operation::kSum);
+
+  std::vector<xgboost::common::Span<uint64_t, kUuidLength>> converted(world);
+  size_t j = 0;
+  for (size_t i = 0; i < uuids.size(); i += kUuidLength) {
+    converted[j] = xgboost::common::Span<uint64_t, kUuidLength>{uuids.data() + i, kUuidLength};
+    j++;
+  }
+
+  auto iter = std::unique(converted.begin(), converted.end());
+  auto n_uniques = std::distance(converted.begin(), iter);
+
+  CHECK_EQ(n_uniques, world)
+      << "Multiple processes within communication group running on same CUDA "
" << PrintUUID(s_this_uuid) << "\n"; + + nccl_unique_id_ = GetUniqueId(); + dh::safe_cuda(cudaSetDevice(device_ordinal_)); + dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world, nccl_unique_id_, rank)); + dh::safe_cuda(cudaStreamCreate(&cuda_stream_)); +} + +NcclDeviceCommunicator::~NcclDeviceCommunicator() { + if (communicator_->GetWorldSize() == 1) { + return; + } + if (cuda_stream_) { + dh::safe_cuda(cudaStreamDestroy(cuda_stream_)); + } + if (nccl_comm_) { + dh::safe_nccl(ncclCommDestroy(nccl_comm_)); + } + if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { + LOG(CONSOLE) << "======== NCCL Statistics========"; + LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_; + LOG(CONSOLE) << "AllReduce total MiB communicated: " << allreduce_bytes_ / 1048576; + } +} + +namespace { +ncclDataType_t GetNcclDataType(DataType const &data_type) { + ncclDataType_t result; + switch (data_type) { + case DataType::kInt8: + result = ncclInt8; + break; + case DataType::kUInt8: + result = ncclUint8; + break; + case DataType::kInt32: + result = ncclInt32; + break; + case DataType::kUInt32: + result = ncclUint32; + break; + case DataType::kInt64: + result = ncclInt64; + break; + case DataType::kUInt64: + result = ncclUint64; + break; + case DataType::kFloat: + result = ncclFloat; + break; + case DataType::kDouble: + result = ncclDouble; + break; + default: + LOG(FATAL) << "Unknown data type."; + } + return result; +} + +bool IsBitwiseOp(Operation const &op) { + return op == Operation::kBitwiseAND || op == Operation::kBitwiseOR || + op == Operation::kBitwiseXOR; +} + +ncclRedOp_t GetNcclRedOp(Operation const &op) { + ncclRedOp_t result; + switch (op) { + case Operation::kMax: + result = ncclMax; + break; + case Operation::kMin: + result = ncclMin; + break; + case Operation::kSum: + result = ncclSum; + break; + default: + LOG(FATAL) << "Unsupported reduce operation."; + } + return result; +} + +template +void RunBitwiseAllreduce(char *out_buffer, char const *device_buffer, Func func, int world_size, + std::size_t size, cudaStream_t stream) { + dh::LaunchN(size, stream, [=] __device__(std::size_t idx) { + out_buffer[idx] = device_buffer[idx]; + for (auto rank = 1; rank < world_size; rank++) { + out_buffer[idx] = func(out_buffer[idx], device_buffer[rank * size + idx]); + } + }); +} +} // anonymous namespace + +void NcclDeviceCommunicator::BitwiseAllReduce(void *send_receive_buffer, std::size_t count, + DataType data_type, Operation op) { + auto const world_size = communicator_->GetWorldSize(); + auto const size = count * GetTypeSize(data_type); + dh::caching_device_vector buffer(size * world_size); + auto *device_buffer = buffer.data().get(); + + // First gather data from all the workers. + dh::safe_nccl(ncclAllGather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type), + nccl_comm_, cuda_stream_)); + + // Then reduce locally. 
+ auto *out_buffer = static_cast(send_receive_buffer); + switch (op) { + case Operation::kBitwiseAND: + RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_and(), world_size, size, + cuda_stream_); + break; + case Operation::kBitwiseOR: + RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_or(), world_size, size, + cuda_stream_); + break; + case Operation::kBitwiseXOR: + RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_xor(), world_size, size, + cuda_stream_); + break; + default: + LOG(FATAL) << "Not a bitwise reduce operation."; + } +} + +void NcclDeviceCommunicator::AllReduce(void *send_receive_buffer, std::size_t count, + DataType data_type, Operation op) { + if (communicator_->GetWorldSize() == 1) { + return; + } + + dh::safe_cuda(cudaSetDevice(device_ordinal_)); + if (IsBitwiseOp(op)) { + BitwiseAllReduce(send_receive_buffer, count, data_type, op); + } else { + dh::safe_nccl(ncclAllReduce(send_receive_buffer, send_receive_buffer, count, + GetNcclDataType(data_type), GetNcclRedOp(op), nccl_comm_, + cuda_stream_)); + } + allreduce_bytes_ += count * GetTypeSize(data_type); + allreduce_calls_ += 1; +} + +void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_bytes, + std::vector *segments, + dh::caching_device_vector *receive_buffer) { + if (communicator_->GetWorldSize() == 1) { + return; + } + + dh::safe_cuda(cudaSetDevice(device_ordinal_)); + int const world_size = communicator_->GetWorldSize(); + int const rank = communicator_->GetRank(); + + segments->clear(); + segments->resize(world_size, 0); + segments->at(rank) = length_bytes; + communicator_->AllReduce(segments->data(), segments->size(), DataType::kUInt64, Operation::kMax); + auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL); + receive_buffer->resize(total_bytes); + + size_t offset = 0; + dh::safe_nccl(ncclGroupStart()); + for (int32_t i = 0; i < world_size; ++i) { + size_t as_bytes = segments->at(i); + dh::safe_nccl(ncclBroadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes, + ncclChar, i, nccl_comm_, cuda_stream_)); + offset += as_bytes; + } + dh::safe_nccl(ncclGroupEnd()); +} + +void NcclDeviceCommunicator::Synchronize() { + if (communicator_->GetWorldSize() == 1) { + return; + } + dh::safe_cuda(cudaSetDevice(device_ordinal_)); + dh::safe_cuda(cudaStreamSynchronize(cuda_stream_)); +} + +} // namespace collective +} // namespace xgboost +#endif diff --git a/src/collective/nccl_device_communicator.cuh b/src/collective/nccl_device_communicator.cuh index 4e58fc5ba..e5f76119d 100644 --- a/src/collective/nccl_device_communicator.cuh +++ b/src/collective/nccl_device_communicator.cuh @@ -1,5 +1,5 @@ /*! 
- * Copyright 2022 XGBoost contributors + * Copyright 2022-2023 XGBoost contributors */ #pragma once @@ -12,116 +12,13 @@ namespace collective { class NcclDeviceCommunicator : public DeviceCommunicator { public: - NcclDeviceCommunicator(int device_ordinal, Communicator *communicator) - : device_ordinal_{device_ordinal}, communicator_{communicator} { - if (device_ordinal_ < 0) { - LOG(FATAL) << "Invalid device ordinal: " << device_ordinal_; - } - if (communicator_ == nullptr) { - LOG(FATAL) << "Communicator cannot be null."; - } - - int32_t const rank = communicator_->GetRank(); - int32_t const world = communicator_->GetWorldSize(); - - if (world == 1) { - return; - } - - std::vector uuids(world * kUuidLength, 0); - auto s_uuid = xgboost::common::Span{uuids.data(), uuids.size()}; - auto s_this_uuid = s_uuid.subspan(rank * kUuidLength, kUuidLength); - GetCudaUUID(s_this_uuid); - - // TODO(rongou): replace this with allgather. - communicator_->AllReduce(uuids.data(), uuids.size(), DataType::kUInt64, Operation::kSum); - - std::vector> converted(world); - size_t j = 0; - for (size_t i = 0; i < uuids.size(); i += kUuidLength) { - converted[j] = xgboost::common::Span{uuids.data() + i, kUuidLength}; - j++; - } - - auto iter = std::unique(converted.begin(), converted.end()); - auto n_uniques = std::distance(converted.begin(), iter); - - CHECK_EQ(n_uniques, world) - << "Multiple processes within communication group running on same CUDA " - << "device is not supported. " << PrintUUID(s_this_uuid) << "\n"; - - nccl_unique_id_ = GetUniqueId(); - dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world, nccl_unique_id_, rank)); - dh::safe_cuda(cudaStreamCreate(&cuda_stream_)); - } - - ~NcclDeviceCommunicator() override { - if (communicator_->GetWorldSize() == 1) { - return; - } - if (cuda_stream_) { - dh::safe_cuda(cudaStreamDestroy(cuda_stream_)); - } - if (nccl_comm_) { - dh::safe_nccl(ncclCommDestroy(nccl_comm_)); - } - if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { - LOG(CONSOLE) << "======== NCCL Statistics========"; - LOG(CONSOLE) << "AllReduce calls: " << allreduce_calls_; - LOG(CONSOLE) << "AllReduce total MiB communicated: " << allreduce_bytes_ / 1048576; - } - } - + NcclDeviceCommunicator(int device_ordinal, Communicator *communicator); + ~NcclDeviceCommunicator() override; void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type, - Operation op) override { - if (communicator_->GetWorldSize() == 1) { - return; - } - - dh::safe_cuda(cudaSetDevice(device_ordinal_)); - dh::safe_nccl(ncclAllReduce(send_receive_buffer, send_receive_buffer, count, - GetNcclDataType(data_type), GetNcclRedOp(op), nccl_comm_, - cuda_stream_)); - allreduce_bytes_ += count * GetTypeSize(data_type); - allreduce_calls_ += 1; - } - + Operation op) override; void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector *segments, - dh::caching_device_vector *receive_buffer) override { - if (communicator_->GetWorldSize() == 1) { - return; - } - - dh::safe_cuda(cudaSetDevice(device_ordinal_)); - int const world_size = communicator_->GetWorldSize(); - int const rank = communicator_->GetRank(); - - segments->clear(); - segments->resize(world_size, 0); - segments->at(rank) = length_bytes; - communicator_->AllReduce(segments->data(), segments->size(), DataType::kUInt64, - Operation::kMax); - auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL); - receive_buffer->resize(total_bytes); - - size_t offset = 0; - dh::safe_nccl(ncclGroupStart()); - for 
(int32_t i = 0; i < world_size; ++i) { - size_t as_bytes = segments->at(i); - dh::safe_nccl(ncclBroadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes, - ncclChar, i, nccl_comm_, cuda_stream_)); - offset += as_bytes; - } - dh::safe_nccl(ncclGroupEnd()); - } - - void Synchronize() override { - if (communicator_->GetWorldSize() == 1) { - return; - } - dh::safe_cuda(cudaSetDevice(device_ordinal_)); - dh::safe_cuda(cudaStreamSynchronize(cuda_stream_)); - } + dh::caching_device_vector *receive_buffer) override; + void Synchronize() override; private: static constexpr std::size_t kUuidLength = @@ -160,60 +57,8 @@ class NcclDeviceCommunicator : public DeviceCommunicator { return id; } - static ncclDataType_t GetNcclDataType(DataType const &data_type) { - ncclDataType_t result; - switch (data_type) { - case DataType::kInt8: - result = ncclInt8; - break; - case DataType::kUInt8: - result = ncclUint8; - break; - case DataType::kInt32: - result = ncclInt32; - break; - case DataType::kUInt32: - result = ncclUint32; - break; - case DataType::kInt64: - result = ncclInt64; - break; - case DataType::kUInt64: - result = ncclUint64; - break; - case DataType::kFloat: - result = ncclFloat; - break; - case DataType::kDouble: - result = ncclDouble; - break; - default: - LOG(FATAL) << "Unknown data type."; - } - return result; - } - - static ncclRedOp_t GetNcclRedOp(Operation const &op) { - ncclRedOp_t result; - switch (op) { - case Operation::kMax: - result = ncclMax; - break; - case Operation::kMin: - result = ncclMin; - break; - case Operation::kSum: - result = ncclSum; - break; - case Operation::kBitwiseAND: - case Operation::kBitwiseOR: - case Operation::kBitwiseXOR: - LOG(FATAL) << "Not implemented yet."; - default: - LOG(FATAL) << "Unknown reduce operation."; - } - return result; - } + void BitwiseAllReduce(void *send_receive_buffer, std::size_t count, DataType data_type, + Operation op); int const device_ordinal_; Communicator *communicator_; diff --git a/tests/cpp/collective/test_nccl_device_communicator.cu b/tests/cpp/collective/test_nccl_device_communicator.cu index 6d3203522..6ac861a55 100644 --- a/tests/cpp/collective/test_nccl_device_communicator.cu +++ b/tests/cpp/collective/test_nccl_device_communicator.cu @@ -5,10 +5,12 @@ #include +#include #include // for string -#include "../../../src/collective/nccl_device_communicator.cuh" #include "../../../src/collective/communicator-inl.cuh" +#include "../../../src/collective/nccl_device_communicator.cuh" +#include "../helpers.h" namespace xgboost { namespace collective { @@ -31,6 +33,69 @@ TEST(NcclDeviceCommunicatorSimpleTest, SystemError) { ASSERT_TRUE(str.find("environment variables") != std::string::npos); } } + +namespace { +void VerifyAllReduceBitwiseAND() { + auto const rank = collective::GetRank(); + std::bitset<64> original{}; + original[rank] = true; + HostDeviceVector buffer({original.to_ullong()}, rank); + collective::AllReduce(rank, buffer.DevicePointer(), 1); + collective::Synchronize(rank); + EXPECT_EQ(buffer.HostVector()[0], 0ULL); +} +} // anonymous namespace + +TEST(NcclDeviceCommunicator, MGPUAllReduceBitwiseAND) { + auto const n_gpus = common::AllVisibleGPUs(); + if (n_gpus <= 1) { + GTEST_SKIP() << "Skipping MGPUAllReduceBitwiseAND test with # GPUs = " << n_gpus; + } + RunWithInMemoryCommunicator(n_gpus, VerifyAllReduceBitwiseAND); +} + +namespace { +void VerifyAllReduceBitwiseOR() { + auto const world_size = collective::GetWorldSize(); + auto const rank = collective::GetRank(); + std::bitset<64> original{}; + 
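[Editor's note on the reduction scheme] Since NCCL has no native bitwise reduction operators, the patch emulates them with an allgather followed by a local element-wise fold (`RunBitwiseAllreduce` above). The snippet below is a minimal host-side sketch of that fold; the function name and the two-rank input are made up for illustration and are not part of the patch.

```cpp
#include <cstddef>
#include <cstdio>
#include <functional>
#include <vector>

// Host-side model of RunBitwiseAllreduce: after the allgather, `gathered`
// holds world_size contiguous copies of the buffer (one per rank, `size`
// bytes each); the reduction folds them element-wise into `out`.
template <typename Func>
void LocalBitwiseReduce(std::vector<unsigned char>* out,
                        std::vector<unsigned char> const& gathered, Func func,
                        int world_size, std::size_t size) {
  out->resize(size);
  for (std::size_t idx = 0; idx < size; ++idx) {
    unsigned char acc = gathered[idx];  // rank 0's byte
    for (int rank = 1; rank < world_size; ++rank) {
      acc = func(acc, gathered[rank * size + idx]);
    }
    (*out)[idx] = acc;
  }
}

int main() {
  // Two ranks, one byte each: 0b0101 AND 0b0110 == 0b0100. This is also why
  // the AND test expects 0 for one-hot inputs while the OR test expects the
  // low world_size bits all set.
  std::vector<unsigned char> gathered{0b0101, 0b0110};
  std::vector<unsigned char> out;
  LocalBitwiseReduce(&out, gathered, std::bit_and<unsigned char>(),
                     /*world_size=*/2, /*size=*/1);
  std::printf("%#x\n", out[0]);  // prints 0x4
  return 0;
}
```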
+
+namespace {
+void VerifyAllReduceBitwiseOR() {
+  auto const world_size = collective::GetWorldSize();
+  auto const rank = collective::GetRank();
+  std::bitset<64> original{};
+  original[rank] = true;
+  HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank);
+  collective::AllReduce<collective::Operation::kBitwiseOR>(rank, buffer.DevicePointer(), 1);
+  collective::Synchronize(rank);
+  EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1);
+}
+}  // anonymous namespace
+
+TEST(NcclDeviceCommunicator, MGPUAllReduceBitwiseOR) {
+  auto const n_gpus = common::AllVisibleGPUs();
+  if (n_gpus <= 1) {
+    GTEST_SKIP() << "Skipping MGPUAllReduceBitwiseOR test with # GPUs = " << n_gpus;
+  }
+  RunWithInMemoryCommunicator(n_gpus, VerifyAllReduceBitwiseOR);
+}
+
+namespace {
+void VerifyAllReduceBitwiseXOR() {
+  auto const world_size = collective::GetWorldSize();
+  auto const rank = collective::GetRank();
+  std::bitset<64> original{~0ULL};
+  original[rank] = false;
+  HostDeviceVector<uint64_t> buffer({original.to_ullong()}, rank);
+  collective::AllReduce<collective::Operation::kBitwiseXOR>(rank, buffer.DevicePointer(), 1);
+  collective::Synchronize(rank);
+  EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1);
+}
+}  // anonymous namespace
+
+TEST(NcclDeviceCommunicator, MGPUAllReduceBitwiseXOR) {
+  auto const n_gpus = common::AllVisibleGPUs();
+  if (n_gpus <= 1) {
+    GTEST_SKIP() << "Skipping MGPUAllReduceBitwiseXOR test with # GPUs = " << n_gpus;
+  }
+  RunWithInMemoryCommunicator(n_gpus, VerifyAllReduceBitwiseXOR);
+}
+
 }  // namespace collective
 }  // namespace xgboost

From ee6809e642971bb56a3832344047ef44ee915f2e Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Mon, 19 Jun 2023 18:52:55 +0800
Subject: [PATCH 003/136] Use mmap for external memory. (#9282)

- Have basic infrastructure for mmap.
- Release file write handle.
---
 demo/guide-python/external_memory.py          |   6 +-
 doc/tutorials/external_memory.rst             | 153 ++++++++++-----
 python-package/xgboost/testing/__init__.py    |   8 +-
 rabit/include/rabit/internal/io.h             |  74 ++++----
 src/common/io.cc                              | 176 ++++++++++++++--
 src/common/io.h                               |  33 +++-
 src/data/sparse_page_source.h                 | 128 ++++++++-----
 src/tree/gpu_hist/gradient_based_sampler.cu   | 104 +++++------
 src/tree/gpu_hist/gradient_based_sampler.cuh  |  50 +++--
 src/tree/updater_gpu_hist.cu                  |  51 ++---
 tests/cpp/common/test_io.cc                   |  60 +++++-
 tests/cpp/histogram_helpers.h                 |   4 +
 .../gpu_hist/test_gradient_based_sampler.cu   |   8 +-
 tests/cpp/tree/test_gpu_hist.cu               |  14 +-
 tests/python-gpu/test_from_cudf.py            |   2 +-
 tests/python/test_data_iterator.py            |   3 +-
 16 files changed, 599 insertions(+), 275 deletions(-)

diff --git a/demo/guide-python/external_memory.py b/demo/guide-python/external_memory.py
index cc5527611..11a05c61c 100644
--- a/demo/guide-python/external_memory.py
+++ b/demo/guide-python/external_memory.py
@@ -82,10 +82,10 @@ def main(tmpdir: str) -> xgboost.Booster:
     missing = np.NaN
     Xy = xgboost.DMatrix(it, missing=missing, enable_categorical=False)
 
-    # Other tree methods including ``hist`` and ``gpu_hist`` also work, see tutorial in
-    # doc for details.
+    # Other tree methods including ``approx`` and ``gpu_hist`` are also supported. GPU
+    # behaves differently from CPU tree methods. See the tutorial in the doc for details.
     booster = xgboost.train(
-        {"tree_method": "approx", "max_depth": 2},
+        {"tree_method": "hist", "max_depth": 4},
         Xy,
         evals=[(Xy, "Train")],
         num_boost_round=10,
diff --git a/doc/tutorials/external_memory.rst b/doc/tutorials/external_memory.rst
index fb69b70e1..545403a34 100644
--- a/doc/tutorials/external_memory.rst
+++ b/doc/tutorials/external_memory.rst
@@ -2,11 +2,25 @@
 Using XGBoost External Memory Version
 #####################################
 
-XGBoost supports loading data from external memory using builtin data parser. And
-starting from version 1.5, users can also define a custom iterator to load data in chunks.
-The feature is still experimental and not yet ready for production use. In this tutorial
-we will introduce both methods. Please note that training on data from external memory is
-not supported by ``exact`` tree method.
+When working with large datasets, training XGBoost models can be challenging as the entire
+dataset needs to be loaded into memory. This can be costly and sometimes
+infeasible. Starting from 1.5, users can define a custom iterator to load data in chunks
+for running XGBoost algorithms. External memory can be used for both training and
+prediction, but training is the primary use case and it will be our focus in this
+tutorial. For prediction and evaluation, users can iterate through the data themselves,
+while training requires the full dataset to be loaded into memory.
+
+During training, there are two different modes for external memory support available in
+XGBoost: one for CPU-based algorithms like ``hist`` and ``approx``, and another for the
+GPU-based training algorithm. We will introduce them in the following sections.
+
+.. note::
+
+  Training on data from external memory is not supported by the ``exact`` tree method.
+
+.. note::
+
+  The feature is still experimental as of 2.0. The performance is not well optimized.
 
 *************
 Data Iterator
 *************
 
 Starting from XGBoost 1.5, users can define their own data loader using Python or C
 interface. There are some examples in the ``demo`` directory for quick start. This is a
 generalized version of text input external memory, where users no longer need to prepare a
-text file that XGBoost recognizes. To enable the feature, user need to define a data
-iterator with 2 class methods ``next`` and ``reset`` then pass it into ``DMatrix``
+text file that XGBoost recognizes. To enable the feature, users need to define a data
+iterator with 2 class methods: ``next`` and ``reset``, then pass it into the ``DMatrix``
 constructor.
 
@@ -60,20 +74,96 @@ constructor.
 
     # Other tree methods including ``hist`` and ``gpu_hist`` also work, but has some caveats
     # as noted in following sections.
-    booster = xgboost.train({"tree_method": "approx"}, Xy)
+    booster = xgboost.train({"tree_method": "hist"}, Xy)
 
-The above snippet is a simplified version of ``demo/guide-python/external_memory.py``. For
-an example in C, please see ``demo/c-api/external-memory/``.
+The above snippet is a simplified version of :ref:`sphx_glr_python_examples_external_memory.py`.
+For an example in C, please see ``demo/c-api/external-memory/``. The iterator is the
+common interface for using external memory with XGBoost; you can pass the resulting
+``DMatrix`` object for training, prediction, and evaluation.
+
+It is important to set the batch size based on the memory available. A good starting point
+is to set the batch size to 10GB per batch if you have 64GB of memory. It is *not*
+recommended to set small batch sizes like 32 samples per batch, as this can seriously hurt
+performance in gradient boosting.
+
+***********
+CPU Version
+***********
+
+In the previous section, we demonstrated how to train a tree-based model using the
+``hist`` tree method on a CPU. This method involves iterating through data batches stored
+in a cache during tree construction. For optimal performance, we recommend using the
+``grow_policy=depthwise`` setting, which allows XGBoost to build an entire layer of tree
+nodes with only a few batch iterations. Conversely, using the ``lossguide`` policy
+requires XGBoost to iterate over the data set for each tree node, resulting in slower
+performance.
+
+If external memory is used, the performance of CPU training is limited by IO
+(input/output) speed. This means that the disk IO speed primarily determines the training
+speed. During benchmarking, we used an NVMe connected to a PCIe-4 slot; other types of
+storage can be too slow for practical usage. In addition, your system may perform caching
+to reduce the overhead of file reading.
+
+**********************************
+GPU Version (GPU Hist tree method)
+**********************************
+
+External memory is supported by GPU algorithms (i.e. when ``tree_method`` is set to
+``gpu_hist``). However, the algorithm used for GPU is different from the one used for
+CPU. When training on a CPU, the tree method iterates through all batches from external
+memory for each step of the tree construction algorithm. On the other hand, the GPU
+algorithm concatenates all batches into one and stores it in GPU memory. To reduce overall
+memory usage, users can utilize subsampling. The good news is that the GPU hist tree
+method supports gradient-based sampling, enabling users to set a low sampling rate without
+compromising accuracy.
+
+.. code-block:: python
+
+  param = {
+    ...
+    'subsample': 0.2,
+    'sampling_method': 'gradient_based',
+  }
+
+For more information about the sampling algorithm and its use in external memory training,
+see `this paper <https://arxiv.org/abs/2005.09148>`_.
+
+.. warning::
+
+  When the GPU runs out of memory during iteration on external memory, the user might
+  receive a segfault instead of an OOM exception.
+
+*******
+Remarks
+*******
+
+When using external memory with XGBoost, data is divided into smaller chunks so that only
+a fraction of it needs to be stored in memory at any given time. It's important to note
+that this method only applies to the predictor data (``X``), while other data, like labels
+and internal runtime structures, are concatenated. This means that memory reduction is most
+effective when dealing with wide datasets where ``X`` is larger compared to other data
+like ``y``, while it has little impact on slim datasets.
+
+Starting with XGBoost 2.0, the implementation of external memory uses ``mmap``. It is not
+yet tested against system errors like disconnected network devices (`SIGBUS`). Also, it's
+worth noting that most tests have been conducted on Linux distributions.
+
+Another important point to keep in mind is that creating the initial cache for XGBoost may
+take some time. The interface to external memory is through custom iterators, which may or
+may not be thread-safe. Therefore, initialization is performed sequentially.
 
 ****************
 Text File Inputs
 ****************
 
-There is no big difference between using external memory version and in-memory version.
-The only difference is the filename format.
+This is the original form of external memory support; users are encouraged to use custom
+data iterators instead. There is no big difference between using the external memory version
+of text input and the in-memory version. The only difference is the filename format.
 
-The external memory version takes in the following `URI <https://en.wikipedia.org/wiki/Uniform_Resource_Identifier>`_ format:
+The external memory version takes in the following `URI
+<https://en.wikipedia.org/wiki/Uniform_Resource_Identifier>`_ format:
 
 .. code-block:: none
 
   filename?format=libsvm#cacheprefix
 
 To load from csv files, use the following syntax:
 
 .. code-block:: none
 
   filename.csv?format=csv&label_column=0#dtrain.cache
 
 where ``label_column`` should point to the csv column acting as the label.
 
-To provide a simple example for illustration, extracting the code from
-`demo/guide-python/external_memory.py <../demo/guide-python/external_memory.py>`_. If
-you have a dataset stored in a file similar to ``agaricus.txt.train`` with LIBSVM format, the external memory support can be enabled by:
+If you have a dataset stored in a file similar to ``demo/data/agaricus.txt.train`` with LIBSVM
+format, the external memory support can be enabled by:
 
 .. code-block:: python
 
   dtrain = xgboost.DMatrix('../data/agaricus.txt.train?format=libsvm#dtrain.cache')
 
 XGBoost will first load ``agaricus.txt.train`` in, preprocess it, then write to a
 more notes about text input formats, see :doc:`/tutorials/input_format`.
 
 For CLI version, simply add the cache suffix, e.g.
 ``"../data/agaricus.txt.train?format=libsvm#dtrain.cache"``.
-
-
-**********************************
-GPU Version (GPU Hist tree method)
-**********************************
-External memory is supported in GPU algorithms (i.e. when ``tree_method`` is set to ``gpu_hist``).
-
-If you are still getting out-of-memory errors after enabling external memory, try subsampling the
-data to further reduce GPU memory usage:
-
-.. code-block:: python
-
-  param = {
-    ...
-    'subsample': 0.1,
-    'sampling_method': 'gradient_based',
-  }
-
-For more information, see `this paper <https://arxiv.org/abs/2005.09148>`_. Internally
-the tree method still concatenate all the chunks into 1 final histogram index due to
-performance reason, but in compressed format. So its scalability has an upper bound but
-still has lower memory cost in general.
-
-***********
-CPU Version
-***********
-
-For CPU histogram based tree methods (``approx``, ``hist``) it's recommended to use
-``grow_policy=depthwise`` for performance reason. Iterating over data batches is slow,
-with ``depthwise`` policy XGBoost can build a entire layer of tree nodes with a few
-iterations, while with ``lossguide`` XGBoost needs to iterate over the data set for each
-tree node.
diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py
index 70e536101..f6abb867e 100644
--- a/python-package/xgboost/testing/__init__.py
+++ b/python-package/xgboost/testing/__init__.py
@@ -198,14 +198,14 @@ class IteratorForTest(xgb.core.DataIter):
         X: Sequence,
         y: Sequence,
         w: Optional[Sequence],
-        cache: Optional[str] = "./",
+        cache: Optional[str],
     ) -> None:
         assert len(X) == len(y)
         self.X = X
         self.y = y
         self.w = w
         self.it = 0
-        super().__init__(cache)
+        super().__init__(cache_prefix=cache)
 
     def next(self, input_data: Callable) -> int:
         if self.it == len(self.X):
@@ -347,7 +347,9 @@ class TestDataset:
         if w is not None:
             weight.append(w)
 
-        it = IteratorForTest(predictor, response, weight if weight else None)
+        it = IteratorForTest(
+            predictor, response, weight if weight else None, cache="cache"
+        )
         return xgb.DMatrix(it)
 
     def __repr__(self) -> str:
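[Editor's note] The next hunk touches up `MemoryFixSizeBuffer`, the fixed-size stream that the new mmap stream inherits from. A minimal sketch of its read/write/seek contract is below; the storage size and payload are made up, and only APIs visible in the diff below are used.

```cpp
#include <cassert>
#include <string>

#include "rabit/internal/io.h"  // rabit::utils::MemoryFixSizeBuffer

int main() {
  // Write into a caller-owned buffer, seek back, and read the bytes out again.
  char storage[16];
  rabit::utils::MemoryFixSizeBuffer buf{storage, sizeof storage};
  std::string const in{"hello"};
  buf.Write(in.data(), in.size());
  buf.Seek(0);  // rewind; Seek(kSeekEnd) would jump to buffer_size_
  char out[6] = {0};
  assert(buf.Read(out, in.size()) == in.size());  // reads back "hello"
  assert(!buf.AtEnd());                           // only 5 of 16 bytes consumed
  return 0;
}
```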
diff --git a/rabit/include/rabit/internal/io.h b/rabit/include/rabit/internal/io.h
index 978eebd8a..a12e1decd 100644
--- a/rabit/include/rabit/internal/io.h
+++ b/rabit/include/rabit/internal/io.h
@@ -1,18 +1,21 @@
-/*!
- * Copyright (c) 2014-2019 by Contributors
+/**
+ * Copyright 2014-2023, XGBoost Contributors
  * \file io.h
  * \brief utilities with different serializable implementations
  * \author Tianqi Chen
  */
 #ifndef RABIT_INTERNAL_IO_H_
 #define RABIT_INTERNAL_IO_H_
+
 #include <dmlc/io.h>
-#include <cstdio>
-#include <vector>
-#include <cstring>
+
 #include <algorithm>
-#include <string>
+#include <cstddef>  // for size_t
+#include <cstdint>
+#include <cstring>  // for memcpy
+#include <limits>
+#include <numeric>
+#include <string>
+#include <vector>
+
 #include "rabit/internal/utils.h"
 #include "rabit/serializable.h"
 
 namespace rabit {
 namespace utils {
 /*! \brief re-use definition of dmlc::SeekStream */
 using SeekStream = dmlc::SeekStream;
-/*! \brief fixed size memory buffer */
+/**
+ * @brief Fixed size memory buffer as a stream.
+ */
 struct MemoryFixSizeBuffer : public SeekStream {
  public:
   // similar to SEEK_END in libc
-  static size_t constexpr kSeekEnd = std::numeric_limits<size_t>::max();
+  static std::size_t constexpr kSeekEnd = std::numeric_limits<std::size_t>::max();
+
+ protected:
+  MemoryFixSizeBuffer() = default;
 
  public:
-  MemoryFixSizeBuffer(void *p_buffer, size_t buffer_size)
-      : p_buffer_(reinterpret_cast<char *>(p_buffer)),
-        buffer_size_(buffer_size) {
-    curr_ptr_ = 0;
-  }
+  /**
+   * @brief Ctor
+   *
+   * @param p_buffer    Pointer to the source buffer with size `buffer_size`.
+   * @param buffer_size Size of the source buffer
+   */
+  MemoryFixSizeBuffer(void *p_buffer, std::size_t buffer_size)
+      : p_buffer_(reinterpret_cast<char *>(p_buffer)), buffer_size_(buffer_size) {}
   ~MemoryFixSizeBuffer() override = default;
-  size_t Read(void *ptr, size_t size) override {
-    size_t nread = std::min(buffer_size_ - curr_ptr_, size);
+
+  std::size_t Read(void *ptr, std::size_t size) override {
+    std::size_t nread = std::min(buffer_size_ - curr_ptr_, size);
     if (nread != 0) std::memcpy(ptr, p_buffer_ + curr_ptr_, nread);
     curr_ptr_ += nread;
     return nread;
   }
-  void Write(const void *ptr, size_t size) override {
+  void Write(const void *ptr, std::size_t size) override {
     if (size == 0) return;
-    utils::Assert(curr_ptr_ + size <= buffer_size_,
-                  "write position exceed fixed buffer size");
+    CHECK_LE(curr_ptr_ + size, buffer_size_);
     std::memcpy(p_buffer_ + curr_ptr_, ptr, size);
     curr_ptr_ += size;
   }
-  void Seek(size_t pos) override {
+  void Seek(std::size_t pos) override {
     if (pos == kSeekEnd) {
       curr_ptr_ = buffer_size_;
     } else {
-      curr_ptr_ = static_cast<size_t>(pos);
+      curr_ptr_ = static_cast<std::size_t>(pos);
     }
   }
-  size_t Tell() override {
-    return curr_ptr_;
-  }
-  virtual bool AtEnd() const {
-    return curr_ptr_ == buffer_size_;
-  }
+  /**
+   * @brief Current position in the buffer (stream).
+   */
+  std::size_t Tell() override { return curr_ptr_; }
+  virtual bool AtEnd() const { return curr_ptr_ == buffer_size_; }
 
- private:
+ protected:
   /*! \brief in memory buffer */
-  char *p_buffer_;
+  char *p_buffer_{nullptr};
   /*! \brief current pointer */
-  size_t buffer_size_;
+  std::size_t buffer_size_{0};
   /*! \brief current pointer */
-  size_t curr_ptr_;
-};  // class MemoryFixSizeBuffer
+  std::size_t curr_ptr_{0};
+};
 
 /*! \brief a in memory buffer that can be read and write as stream interface */
 struct MemoryBufferStream : public SeekStream {
diff --git a/src/common/io.cc b/src/common/io.cc
index da3a75d65..ba97db574 100644
--- a/src/common/io.cc
+++ b/src/common/io.cc
@@ -1,24 +1,47 @@
-/*!
- * Copyright (c) by XGBoost Contributors 2019-2022
+/**
+ * Copyright 2019-2023, by XGBoost Contributors
  */
+#if !defined(NOMINMAX) && defined(_WIN32)
+#define NOMINMAX
+#endif  // !defined(NOMINMAX)
+
+#if !defined(xgboost_IS_WIN)
+
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#define xgboost_IS_WIN 1
+#endif  // defined(_MSC_VER) || defined(__MINGW32__)
+
+#endif  // !defined(xgboost_IS_WIN)
+
-#if defined(__unix__)
-#include <fcntl.h>
-#include <sys/mman.h>
-#include <unistd.h>
+#if defined(__unix__) || defined(__APPLE__)
+#include <fcntl.h>     // for open, O_RDONLY
+#include <sys/mman.h>  // for mmap, mmap64, munmap
+#include <unistd.h>    // for close, getpagesize
+#elif defined(xgboost_IS_WIN)
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
 #endif  // defined(__unix__)
 
-#include <algorithm>
-#include <cerrno>
-#include <cstdio>
-#include <fstream>
-#include <limits>
-#include <memory>
-#include <string>
+#include <algorithm>     // for copy, transform
+#include <cctype>        // for tolower
+#include <cerrno>        // for errno
+#include <cstddef>       // for size_t
+#include <cstdint>       // for int32_t, uint32_t
+#include <cstring>       // for memcpy
+#include <fstream>       // for ifstream
+#include <iterator>      // for distance
+#include <limits>        // for numeric_limits
+#include <memory>        // for unique_ptr
+#include <string>        // for string
+#include <system_error>  // for error_code, system_category
+#include <utility>       // for move
+#include <vector>        // for vector
 
-#include "xgboost/logging.h"
 #include "io.h"
+#include "xgboost/collective/socket.h"  // for LastError
+#include "xgboost/logging.h"
 
-namespace xgboost {
-namespace common {
-
+namespace xgboost::common {
 size_t PeekableInStream::Read(void* dptr, size_t size) {
   size_t nbuffer = buffer_.length() - buffer_ptr_;
   if (nbuffer == 0) return strm_->Read(dptr, size);
@@ -94,11 +117,32 @@ void FixedSizeStream::Take(std::string* out) {
   *out = std::move(buffer_);
 }
 
+namespace {
+// Get system alignment value for IO with mmap.
+std::size_t GetMmapAlignment() {
+#if defined(xgboost_IS_WIN)
+  SYSTEM_INFO sys_info;
+  GetSystemInfo(&sys_info);
+  // During testing, `sys_info.dwPageSize` is of size 4096 while `dwAllocationGranularity` is of
+  // size 65536.
+  return sys_info.dwAllocationGranularity;
+#else
+  return getpagesize();
+#endif
+}
+
+auto SystemErrorMsg() {
+  std::int32_t errsv = system::LastError();
+  auto err = std::error_code{errsv, std::system_category()};
+  return err.message();
+}
+}  // anonymous namespace
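[Editor's note] `mmap` requires the mapped file offset to be a multiple of the page size (or the allocation granularity on Windows), which is why `Open` below rounds the requested offset down and maps a slightly larger view. A quick worked example, assuming a 4096-byte alignment value:

```cpp
#include <cassert>
#include <cstddef>

int main() {
  // Hypothetical request: 100 bytes starting at byte 5000 of a cache file,
  // with a 4096-byte page size (the value GetMmapAlignment() would return).
  std::size_t const alignment = 4096, offset = 5000, length = 100;
  std::size_t const view_start = offset / alignment * alignment;  // 4096
  std::size_t const view_size = length + (offset - view_start);   // 100 + 904
  assert(view_start == 4096 && view_size == 1004);
  // The usable pointer is the mapped base plus the rounding slack:
  std::size_t const slack = offset - view_start;  // 904
  assert(slack + length <= view_size);
  return 0;
}
```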
 
 std::string LoadSequentialFile(std::string uri, bool stream) {
   auto OpenErr = [&uri]() {
     std::string msg;
     msg = "Opening " + uri + " failed: ";
-    msg += strerror(errno);
+    msg += SystemErrorMsg();
     LOG(FATAL) << msg;
   };
 
@@ -155,5 +199,99 @@ std::string FileExtension(std::string fname, bool lower) {
     return "";
   }
 }
+
+struct PrivateMmapConstStream::MMAPFile {
+#if defined(xgboost_IS_WIN)
+  HANDLE fd{INVALID_HANDLE_VALUE};
+  HANDLE file_map{INVALID_HANDLE_VALUE};
+#else
+  std::int32_t fd{0};
+#endif
+  char* base_ptr{nullptr};
+  std::size_t base_size{0};
+  std::string path;
+};
+
+char* PrivateMmapConstStream::Open(std::string path, std::size_t offset, std::size_t length) {
+  if (length == 0) {
+    return nullptr;
+  }
+
+#if defined(xgboost_IS_WIN)
+  HANDLE fd = CreateFile(path.c_str(), GENERIC_READ, FILE_SHARE_READ, nullptr, OPEN_EXISTING,
+                         FILE_ATTRIBUTE_NORMAL | FILE_FLAG_OVERLAPPED, nullptr);
+  CHECK_NE(fd, INVALID_HANDLE_VALUE) << "Failed to open:" << path << ". " << SystemErrorMsg();
+#else
+  auto fd = open(path.c_str(), O_RDONLY);
+  CHECK_GE(fd, 0) << "Failed to open:" << path << ". " << SystemErrorMsg();
+#endif
+
+  char* ptr{nullptr};
+  // Round down for alignment.
+  auto view_start = offset / GetMmapAlignment() * GetMmapAlignment();
+  auto view_size = length + (offset - view_start);
+
+#if defined(__linux__) || defined(__GLIBC__)
+  int prot{PROT_READ};
+  ptr = reinterpret_cast<char*>(mmap64(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start));
+  CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg();
+  handle_.reset(new MMAPFile{fd, ptr, view_size, std::move(path)});
+#elif defined(xgboost_IS_WIN)
+  auto file_size = GetFileSize(fd, nullptr);
+  DWORD access = PAGE_READONLY;
+  auto map_file = CreateFileMapping(fd, nullptr, access, 0, file_size, nullptr);
+  access = FILE_MAP_READ;
+  std::uint32_t loff = static_cast<std::uint32_t>(view_start);
+  std::uint32_t hoff = view_start >> 32;
+  CHECK(map_file) << "Failed to map: " << path << ". " << SystemErrorMsg();
+  ptr = reinterpret_cast<char*>(MapViewOfFile(map_file, access, hoff, loff, view_size));
+  CHECK_NE(ptr, nullptr) << "Failed to map: " << path << ". " << SystemErrorMsg();
+  handle_.reset(new MMAPFile{fd, map_file, ptr, view_size, std::move(path)});
+#else
+  CHECK_LE(offset, std::numeric_limits<off_t>::max())
+      << "File size has exceeded the limit on the current system.";
+  int prot{PROT_READ};
+  ptr = reinterpret_cast<char*>(mmap(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start));
+  CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg();
+  handle_.reset(new MMAPFile{fd, ptr, view_size, std::move(path)});
+#endif  // defined(__linux__)
+
+  ptr += (offset - view_start);
+  return ptr;
+}
+
+PrivateMmapConstStream::PrivateMmapConstStream(std::string path, std::size_t offset,
+                                               std::size_t length)
+    : MemoryFixSizeBuffer{}, handle_{nullptr} {
+  this->p_buffer_ = Open(std::move(path), offset, length);
+  this->buffer_size_ = length;
+}
+
+PrivateMmapConstStream::~PrivateMmapConstStream() {
+  CHECK(handle_);
+#if defined(xgboost_IS_WIN)
+  if (p_buffer_) {
+    CHECK(UnmapViewOfFile(handle_->base_ptr))
+        << "Failed to call UnmapViewOfFile: " << SystemErrorMsg();
+  }
+  if (handle_->fd != INVALID_HANDLE_VALUE) {
+    CHECK(CloseHandle(handle_->fd)) << "Failed to close handle: " << SystemErrorMsg();
+  }
+  if (handle_->file_map != INVALID_HANDLE_VALUE) {
+    CHECK(CloseHandle(handle_->file_map)) << "Failed to close mapping object: " << SystemErrorMsg();
+  }
+#else
+  if (handle_->base_ptr) {
+    CHECK_NE(munmap(handle_->base_ptr, handle_->base_size), -1)
+        << "Failed to call munmap: " << handle_->path << ". " << SystemErrorMsg();
+  }
+  if (handle_->fd != 0) {
+    CHECK_NE(close(handle_->fd), -1)
+        << "Failed to close: " << handle_->path << ". " << SystemErrorMsg();
+  }
+#endif
+}
-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common
+
+#if defined(xgboost_IS_WIN)
+#undef xgboost_IS_WIN
+#endif  // defined(xgboost_IS_WIN)
diff --git a/src/common/io.h b/src/common/io.h
index 2dd593c60..ab408dec1 100644
--- a/src/common/io.h
+++ b/src/common/io.h
@@ -1,5 +1,5 @@
-/*!
- * Copyright by XGBoost Contributors 2014-2022
+/**
+ * Copyright 2014-2023, XGBoost Contributors
  * \file io.h
  * \brief general stream interface for serialization, I/O
  * \author Tianqi Chen
@@ -10,9 +10,11 @@
 #include <dmlc/io.h>
 #include <rabit/rabit.h>
-#include <string>
+
 #include <cstring>
 #include <fstream>
+#include <memory>  // for unique_ptr
+#include <string>  // for string
 
 #include "common.h"
 
@@ -127,6 +129,31 @@ inline std::string ReadAll(std::string const &path) {
   return content;
 }
 
+/**
+ * @brief Private mmap file as a read-only stream.
+ *
+ * It can calculate alignment automatically based on system page size (or allocation
+ * granularity on Windows).
+ */
+class PrivateMmapConstStream : public MemoryFixSizeBuffer {
+  struct MMAPFile;
+  std::unique_ptr<MMAPFile> handle_;
+
+  char* Open(std::string path, std::size_t offset, std::size_t length);
+
+ public:
+  /**
+   * @brief Construct a private mmap stream.
+   *
+   * @param path   File path.
+   * @param offset See the `offset` parameter of `mmap` for details.
+   * @param length See the `length` parameter of `mmap` for details.
+   */
+  explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length);
+  void Write(void const*, std::size_t) override { LOG(FATAL) << "Read-only stream."; }
+
+  ~PrivateMmapConstStream() override;
+};
 }  // namespace common
 }  // namespace xgboost
 #endif  // XGBOOST_COMMON_IO_H_
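[Editor's note] A small usage sketch of the new stream: map one page's byte range out of a cache shard and copy it into host memory. The wrapper function, path, and sizes are made up; only the constructor and the inherited `Read` shown above are used, and the `CHECK_EQ` macro comes in through the headers `io.h` already includes.

```cpp
#include <cstddef>
#include <string>
#include <utility>
#include <vector>

#include "../src/common/io.h"  // xgboost::common::PrivateMmapConstStream

// Map [offset, offset + length) of a cache shard privately and read it out.
// Alignment of `offset` is handled inside the stream, as documented above.
std::vector<char> ReadShard(std::string path, std::size_t offset, std::size_t length) {
  xgboost::common::PrivateMmapConstStream fi{std::move(path), offset, length};
  std::vector<char> buf(length);
  CHECK_EQ(fi.Read(buf.data(), buf.size()), length);  // stream starts at `offset`
  return buf;
}
```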
diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h
index 088f1e98c..b4e42f2db 100644
--- a/src/data/sparse_page_source.h
+++ b/src/data/sparse_page_source.h
@@ -1,35 +1,34 @@
-/*!
- * Copyright 2014-2022 by XGBoost Contributors
+/**
+ * Copyright 2014-2023, XGBoost Contributors
  * \file sparse_page_source.h
  */
 #ifndef XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
 #define XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
 
-#include <algorithm>  // std::min
-#include <string>
-#include <utility>
-#include <memory>
-#include <future>
+#include <algorithm>  // for min
+#include <future>     // async
 #include <map>
 #include <memory>
+#include <mutex>
+#include <numeric>
+#include <string>
+#include <utility>
+#include <vector>
 
+#include "../common/common.h"
+#include "../common/io.h"  // for PrivateMmapStream, PadPageForMMAP
+#include "../common/timer.h"  // for Monitor, Timer
+#include "adapter.h"
+#include "dmlc/common.h"  // OMPException
+#include "proxy_dmatrix.h"
+#include "sparse_page_writer.h"
 #include "xgboost/base.h"
 #include "xgboost/data.h"
 
-#include "adapter.h"
-#include "sparse_page_writer.h"
-#include "proxy_dmatrix.h"
-
-#include "../common/common.h"
-#include "../common/timer.h"
-
-namespace xgboost {
-namespace data {
+namespace xgboost::data {
 inline void TryDeleteCacheFile(const std::string& file) {
   if (std::remove(file.c_str()) != 0) {
     LOG(WARNING) << "Couldn't remove external memory cache file " << file
                  << "; you may want to remove it manually";
   }
 }
@@ -54,6 +53,9 @@ struct Cache {
   std::string ShardName() {
     return ShardName(this->name, this->format);
   }
+
+  void Push(std::size_t n_bytes) { offset.push_back(n_bytes); }
 
   // The write is completed.
   void Commit() {
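[Editor's note] `Cache::Push` records per-batch byte counts and `Commit` turns them into a running prefix sum, so batch `i` lives at `[offset[i], offset[i + 1])` inside the shard — exactly the `(offset, length)` pair handed to `PrivateMmapConstStream` in `ReadCache` below. A worked example with made-up page sizes; the leading 0 sentinel is assumed from the surrounding checks (`offset.size() == n_batches + 1`):

```cpp
#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

int main() {
  // Page sizes recorded by Cache::Push during WriteCache: 100, 150, 50 bytes.
  std::vector<std::uint64_t> offset{0, 100, 150, 50};
  // What Commit effectively does:
  std::partial_sum(offset.begin(), offset.end(), offset.begin());
  // offset == {0, 100, 250, 300}: batch i occupies [offset[i], offset[i + 1]).
  std::uint64_t const begin = offset[1], length = offset[2] - offset[1];
  assert(begin == 100 && length == 150);  // byte range for batch 1
  return 0;
}
```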
@@ -95,56 +97,72 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
   uint32_t n_batches_ {0};
 
   std::shared_ptr<Cache> cache_info_;
-  std::unique_ptr<dmlc::Stream> fo_;
 
   using Ring = std::vector<std::future<std::shared_ptr<S>>>;
   // A ring storing futures to data.  Since the DMatrix iterator is forward only, so we
   // can pre-fetch data in a ring.
   std::unique_ptr<Ring> ring_{new Ring};
+  dmlc::OMPException exec_;
+  common::Monitor monitor_;
 
   bool ReadCache() {
     CHECK(!at_end_);
     if (!cache_info_->written) {
       return false;
     }
-    if (fo_) {
-      fo_.reset();  // flush the data to disk.
+    if (ring_->empty()) {
       ring_->resize(n_batches_);
     }
     // A heuristic for the number of pre-fetched batches.  We can make it part of BatchParam
     // to let the user adjust the number of pre-fetched batches when needed.
-    uint32_t constexpr kPreFetch = 4;
+    uint32_t constexpr kPreFetch = 3;
 
     size_t n_prefetch_batches = std::min(kPreFetch, n_batches_);
    CHECK_GT(n_prefetch_batches, 0) << "total batches:" << n_batches_;
-    size_t fetch_it = count_;
+    std::size_t fetch_it = count_;
+
+    exec_.Rethrow();
 
-    for (size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
+    monitor_.Start("launch");
+    for (std::size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
       fetch_it %= n_batches_;  // ring
       if (ring_->at(fetch_it).valid()) {
         continue;
       }
       auto const* self = this;  // make sure it's const
       CHECK_LT(fetch_it, cache_info_->offset.size());
-      ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self]() {
-        common::Timer timer;
-        timer.Start();
-        std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
-        auto n = self->cache_info_->ShardName();
-        size_t offset = self->cache_info_->offset.at(fetch_it);
-        std::unique_ptr<dmlc::SeekStream> fi{dmlc::SeekStream::CreateForRead(n.c_str())};
-        fi->Seek(offset);
-        CHECK_EQ(fi->Tell(), offset);
+      ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self, this]() {
         auto page = std::make_shared<S>();
-        CHECK(fmt->Read(page.get(), fi.get()));
-        LOG(INFO) << "Read a page in " << timer.ElapsedSeconds() << " seconds.";
+        this->exec_.Run([&] {
+          common::Timer timer;
+          timer.Start();
+          std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
+          auto n = self->cache_info_->ShardName();
+
+          std::uint64_t offset = self->cache_info_->offset.at(fetch_it);
+          std::uint64_t length = self->cache_info_->offset.at(fetch_it + 1) - offset;
+
+          auto fi = std::make_unique<common::PrivateMmapConstStream>(n, offset, length);
+          CHECK(fmt->Read(page.get(), fi.get()));
+          timer.Stop();
+
+          LOG(INFO) << "Read a page `" << typeid(S).name() << "` in " << timer.ElapsedSeconds()
+                    << " seconds.";
+        });
         return page;
       });
     }
+    monitor_.Stop("launch");
+
     CHECK_EQ(std::count_if(ring_->cbegin(), ring_->cend(), [](auto const& f) { return f.valid(); }),
              n_prefetch_batches)
         << "Sparse DMatrix assumes forward iteration.";
+
+    monitor_.Start("Wait");
     page_ = (*ring_)[count_].get();
+    monitor_.Stop("Wait");
     CHECK(!(*ring_)[count_].valid());
+
+    exec_.Rethrow();
+
     return true;
   }
 
   void WriteCache() {
     CHECK(!cache_info_->written);
     common::Timer timer;
     timer.Start();
     std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
-    if (!fo_) {
-      auto n = cache_info_->ShardName();
-      fo_.reset(dmlc::Stream::Create(n.c_str(), "w"));
-    }
-    auto bytes = fmt->Write(*page_, fo_.get());
-    timer.Stop();
+
+    auto name = cache_info_->ShardName();
+    std::unique_ptr<dmlc::Stream> fo;
+    if (this->Iter() == 0) {
+      fo.reset(dmlc::Stream::Create(name.c_str(), "wb"));
+    } else {
+      fo.reset(dmlc::Stream::Create(name.c_str(), "ab"));
+    }
+
+    auto bytes = fmt->Write(*page_, fo.get());
+
+    timer.Stop();
     LOG(INFO) << static_cast<double>(bytes) / 1024.0 / 1024.0 << " MB written in "
               << timer.ElapsedSeconds() << " seconds.";
-    cache_info_->offset.push_back(bytes);
+    cache_info_->Push(bytes);
   }
 
   virtual void Fetch() = 0;
 
  public:
-  SparsePageSourceImpl(float missing, int nthreads, bst_feature_t n_features,
-                       uint32_t n_batches, std::shared_ptr<Cache> cache)
-      : missing_{missing}, nthreads_{nthreads}, n_features_{n_features},
-        n_batches_{n_batches}, cache_info_{std::move(cache)} {}
+  SparsePageSourceImpl(float missing, int nthreads, bst_feature_t n_features, uint32_t n_batches,
+                       std::shared_ptr<Cache> cache)
+      : missing_{missing},
+        nthreads_{nthreads},
+        n_features_{n_features},
+        n_batches_{n_batches},
+        cache_info_{std::move(cache)} {
+    monitor_.Init(typeid(S).name());  // not pretty, but works for basic profiling
+  }
 
   SparsePageSourceImpl(SparsePageSourceImpl const &that) = delete;
@@ -244,7 +272,7 @@ class SparsePageSource : public SparsePageSourceImpl<SparsePage> {
         iter_{iter}, proxy_{proxy} {
     if (!cache_info_->written) {
       iter_.Reset();
-      CHECK_EQ(iter_.Next(), 1) << "Must have at least 1 batch.";
+      CHECK(iter_.Next()) << "Must have at least 1 batch.";
     }
     this->Fetch();
   }
@@ -259,6 +287,7 @@ class SparsePageSource : public SparsePageSourceImpl<SparsePage> {
     }
 
     if (at_end_) {
+      CHECK_EQ(cache_info_->offset.size(), n_batches_ + 1);
      cache_info_->Commit();
       if (n_batches_ != 0) {
         CHECK_EQ(count_, n_batches_);
@@ -371,6 +400,5 @@ class SortedCSCPageSource : public PageSourceIncMixIn<SortedCSCPage> {
     this->Fetch();
   }
 };
-}  // namespace data
-}  // namespace xgboost
+}  // namespace xgboost::data
 #endif  // XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
diff --git a/src/tree/gpu_hist/gradient_based_sampler.cu b/src/tree/gpu_hist/gradient_based_sampler.cu
index f22fa172f..5f763fb93 100644
--- a/src/tree/gpu_hist/gradient_based_sampler.cu
+++ b/src/tree/gpu_hist/gradient_based_sampler.cu
@@ -146,27 +146,30 @@ class PoissonSampling : public thrust::binary_function<GradientPair, size_t,
 
-NoSampling::NoSampling(EllpackPageImpl const* page) : page_(page) {}
-
-GradientBasedSample NoSampling::Sample(Context const*, common::Span<GradientPair> gpair,
+NoSampling::NoSampling(BatchParam batch_param) : batch_param_{std::move(batch_param)} {}
+
+GradientBasedSample NoSampling::Sample(Context const* ctx, common::Span<GradientPair> gpair,
                                        DMatrix* dmat) {
-  return {dmat->Info().num_row_, page_, gpair};
+  auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
+  return {dmat->Info().num_row_, page, gpair};
 }
 
-ExternalMemoryNoSampling::ExternalMemoryNoSampling(Context const* ctx, EllpackPageImpl const* page,
-                                                   size_t n_rows, BatchParam batch_param)
-    : batch_param_{std::move(batch_param)},
-      page_(new EllpackPageImpl(ctx->gpu_id, page->Cuts(), page->is_dense, page->row_stride,
-                                n_rows)) {}
+ExternalMemoryNoSampling::ExternalMemoryNoSampling(BatchParam batch_param)
+    : batch_param_{std::move(batch_param)} {}
 
 GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
                                                      common::Span<GradientPair> gpair,
                                                      DMatrix* dmat) {
   if (!page_concatenated_) {
     // Concatenate all the external memory ELLPACK pages into a single in-memory page.
+    page_.reset(nullptr);
     size_t offset = 0;
     for (auto& batch : dmat->GetBatches<EllpackPage>(ctx, batch_param_)) {
       auto page = batch.Impl();
+      if (!page_) {
+        page_ = std::make_unique<EllpackPageImpl>(ctx->gpu_id, page->Cuts(), page->is_dense,
+                                                  page->row_stride, dmat->Info().num_row_);
+      }
       size_t num_elements = page_->Copy(ctx->gpu_id, page, offset);
       offset += num_elements;
     }
     page_concatenated_ = true;
   }
   return {dmat->Info().num_row_, page_.get(), gpair};
 }
 
-UniformSampling::UniformSampling(EllpackPageImpl const* page, float subsample)
-    : page_(page), subsample_(subsample) {}
+UniformSampling::UniformSampling(BatchParam batch_param, float subsample)
+    : batch_param_{std::move(batch_param)}, subsample_(subsample) {}
 
 GradientBasedSample UniformSampling::Sample(Context const* ctx, common::Span<GradientPair> gpair,
                                             DMatrix* dmat) {
   // Set gradient pair to 0 with p = 1 - subsample
   thrust::replace_if(ctx->CUDACtx()->CTP(), dh::tbegin(gpair), dh::tend(gpair),
                      thrust::counting_iterator<std::size_t>(0),
                      BernoulliTrial(common::GlobalRandom()(), subsample_), GradientPair());
-  return {dmat->Info().num_row_, page_, gpair};
+  auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
+  return {dmat->Info().num_row_, page, gpair};
 }
 
@@ -236,12 +240,10 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx,
   return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
 }
 
-GradientBasedSampling::GradientBasedSampling(EllpackPageImpl const* page,
-                                             size_t n_rows,
-                                             const BatchParam&,
+GradientBasedSampling::GradientBasedSampling(std::size_t n_rows, BatchParam batch_param,
                                              float subsample)
-    : page_(page),
-      subsample_(subsample),
+    : subsample_(subsample),
+      batch_param_{std::move(batch_param)},
       threshold_(n_rows + 1, 0.0f),
       grad_sum_(n_rows, 0.0f) {}
 
 GradientBasedSample GradientBasedSampling::Sample(Context const* ctx,
                                                   common::Span<GradientPair> gpair,
                                                   DMatrix* dmat) {
+  auto cuctx = ctx->CUDACtx();
   size_t n_rows = dmat->Info().num_row_;
   size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex(
       gpair, dh::ToSpan(threshold_), dh::ToSpan(grad_sum_), n_rows * subsample_);
 
+  auto page = (*dmat->GetBatches<EllpackPage>(ctx, batch_param_).begin()).Impl();
+
   // Perform Poisson sampling in place.
   thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
                     thrust::counting_iterator<size_t>(0), dh::tbegin(gpair),
                     PoissonSampling(dh::ToSpan(threshold_), threshold_index,
                                     RandomWeight(common::GlobalRandom()())));
-  return {n_rows, page_, gpair};
+  return {n_rows, page, gpair};
 }
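[Editor's note] Between the two gradient-based `Sample` implementations it may help to see the selection rule the `PoissonSampling` functor applies. The sketch below is a host-side approximation only: the lambda weight, the threshold value, and the exact combination formula are assumptions (the real ones live in `CombineGradientPair` and `CalculateThresholdIndex`, which are not shown in this patch).

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>

int main() {
  // Host-side approximation of the in-place Poisson sampling: a row's keep
  // probability is proportional to its combined gradient, capped at 1.
  float const lambda = 0.5f;     // made-up hessian weight
  float const threshold = 1.0f;  // made-up value of threshold_[threshold_index]
  float const grad[] = {0.1f, 2.0f, 0.5f}, hess[] = {1.0f, 1.0f, 1.0f};
  std::mt19937 rng{42};
  std::uniform_real_distribution<float> uniform{0.0f, 1.0f};
  for (int i = 0; i < 3; ++i) {
    float combined = std::sqrt(grad[i] * grad[i] + lambda * hess[i] * hess[i]);
    float p = std::min(1.0f, combined / threshold);
    // Kept rows are re-weighted by 1 / p so the expected gradient sum is unbiased.
    bool keep = uniform(rng) < p;
    std::printf("row %d: p=%.3f keep=%d weight=%.3f\n", i, p, keep, keep ? 1.0f / p : 0.0f);
  }
  return 0;
}
```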
 
 ExternalMemoryGradientBasedSampling::ExternalMemoryGradientBasedSampling(size_t n_rows,
                                                                          BatchParam batch_param,
                                                                          float subsample)
     : batch_param_(std::move(batch_param)),
       subsample_(subsample),
       threshold_(n_rows + 1, 0.0f),
       grad_sum_(n_rows, 0.0f),
       sample_row_index_(n_rows) {}
 
 GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* ctx,
                                                                 common::Span<GradientPair> gpair,
                                                                 DMatrix* dmat) {
-  size_t n_rows = dmat->Info().num_row_;
+  auto cuctx = ctx->CUDACtx();
+  bst_row_t n_rows = dmat->Info().num_row_;
   size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex(
       gpair, dh::ToSpan(threshold_), dh::ToSpan(grad_sum_), n_rows * subsample_);
 
   // Perform Poisson sampling in place.
-  thrust::transform(dh::tbegin(gpair), dh::tend(gpair),
-                    thrust::counting_iterator<size_t>(0),
-                    dh::tbegin(gpair),
-                    PoissonSampling(dh::ToSpan(threshold_),
-                                    threshold_index,
+  thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair),
+                    thrust::counting_iterator<size_t>(0), dh::tbegin(gpair),
+                    PoissonSampling(dh::ToSpan(threshold_), threshold_index,
                                     RandomWeight(common::GlobalRandom()())));
 
   // Count the sampled rows.
   size_t sample_rows = thrust::count_if(dh::tbegin(gpair), dh::tend(gpair), IsNonZero());
 
   // Compact gradient pairs.
   gpair_.resize(sample_rows);
-  thrust::copy_if(dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero());
+  thrust::copy_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero());
 
   // Index the sample rows.
-  thrust::transform(dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(), IsNonZero());
-  thrust::exclusive_scan(sample_row_index_.begin(), sample_row_index_.end(),
-                         sample_row_index_.begin());
-  thrust::transform(dh::tbegin(gpair), dh::tend(gpair),
-                    sample_row_index_.begin(),
-                    sample_row_index_.begin(),
-                    ClearEmptyRows());
+  thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(),
+                    IsNonZero());
+  thrust::exclusive_scan(cuctx->CTP(), sample_row_index_.begin(), sample_row_index_.end(),
+                         sample_row_index_.begin());
+  thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(),
+                    sample_row_index_.begin(), ClearEmptyRows());
 
   auto batch_iterator = dmat->GetBatches<EllpackPage>(ctx, batch_param_);
   auto first_page = (*batch_iterator.begin()).Impl();
 
   return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
 }
 
-GradientBasedSampler::GradientBasedSampler(Context const* ctx, EllpackPageImpl const* page,
-                                           size_t n_rows, const BatchParam& batch_param,
-                                           float subsample, int sampling_method) {
+GradientBasedSampler::GradientBasedSampler(Context const* /*ctx*/, size_t n_rows,
+                                           const BatchParam& batch_param, float subsample,
+                                           int sampling_method, bool is_external_memory) {
+  // The ctx is kept here for future development of stream-based operations.
   monitor_.Init("gradient_based_sampler");
 
   bool is_sampling = subsample < 1.0;
-  bool is_external_memory = page->n_rows != n_rows;
 
   if (is_sampling) {
     switch (sampling_method) {
       case TrainParam::kUniform:
         if (is_external_memory) {
           strategy_.reset(new ExternalMemoryUniformSampling(n_rows, batch_param, subsample));
         } else {
-          strategy_.reset(new UniformSampling(page, subsample));
+          strategy_.reset(new UniformSampling(batch_param, subsample));
         }
         break;
       case TrainParam::kGradientBased:
         if (is_external_memory) {
-          strategy_.reset(
-              new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
+          strategy_.reset(new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
         } else {
-          strategy_.reset(new GradientBasedSampling(page, n_rows, batch_param, subsample));
+          strategy_.reset(new GradientBasedSampling(n_rows, batch_param, subsample));
         }
         break;
-      default:LOG(FATAL) << "unknown sampling method";
+      default:
+        LOG(FATAL) << "unknown sampling method";
     }
   } else {
     if (is_external_memory) {
-      strategy_.reset(new ExternalMemoryNoSampling(ctx, page, n_rows, batch_param));
+      strategy_.reset(new ExternalMemoryNoSampling(batch_param));
     } else {
-      strategy_.reset(new NoSampling(page));
+      strategy_.reset(new NoSampling(batch_param));
     }
   }
 }
 
 GradientBasedSample GradientBasedSampler::Sample(Context const* ctx,
                                                  common::Span<GradientPair> gpair, DMatrix* dmat) {
   return sample;
 }
 
-size_t GradientBasedSampler::CalculateThresholdIndex(
-    common::Span<GradientPair> gpair, common::Span<float> threshold,
-    common::Span<float> grad_sum, size_t sample_rows) {
-  thrust::fill(dh::tend(threshold) - 1, dh::tend(threshold),
-               std::numeric_limits<float>::max());
+size_t GradientBasedSampler::CalculateThresholdIndex(common::Span<GradientPair> gpair,
+                                                     common::Span<float> threshold,
+                                                     common::Span<float> grad_sum,
+                                                     size_t sample_rows) {
+  thrust::fill(dh::tend(threshold) - 1, dh::tend(threshold), std::numeric_limits<float>::max());
   thrust::transform(dh::tbegin(gpair), dh::tend(gpair), dh::tbegin(threshold),
                     CombineGradientPair());
   thrust::sort(dh::tbegin(threshold), dh::tend(threshold) - 1);
       thrust::min_element(dh::tbegin(grad_sum), dh::tend(grad_sum));
   return thrust::distance(dh::tbegin(grad_sum), min) + 1;
 }
-
 };  // namespace tree
 };  // namespace xgboost
diff --git a/src/tree/gpu_hist/gradient_based_sampler.cuh b/src/tree/gpu_hist/gradient_based_sampler.cuh
index dafb98cfd..f89bf242e 100644
--- a/src/tree/gpu_hist/gradient_based_sampler.cuh
+++ b/src/tree/gpu_hist/gradient_based_sampler.cuh
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2019 by XGBoost Contributors
+/**
+ * Copyright 2019-2023, XGBoost Contributors
  */
 #pragma once
 #include
@@ -32,37 +32,36 @@ class SamplingStrategy {
 /*! \brief No sampling in in-memory mode. */
 class NoSampling : public SamplingStrategy {
  public:
-  explicit NoSampling(EllpackPageImpl const* page);
-  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
-                             DMatrix* dmat) override;
-
- private:
-  EllpackPageImpl const* page_;
-};
-
-/*! \brief No sampling in external memory mode. */
-class ExternalMemoryNoSampling : public SamplingStrategy {
- public:
-  ExternalMemoryNoSampling(Context const* ctx, EllpackPageImpl const* page, size_t n_rows,
-                           BatchParam batch_param);
+  explicit NoSampling(BatchParam batch_param);
   GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
                              DMatrix* dmat) override;
 
  private:
   BatchParam batch_param_;
-  std::unique_ptr<EllpackPageImpl> page_;
+};
+
+/*! \brief No sampling in external memory mode. */
+class ExternalMemoryNoSampling : public SamplingStrategy {
+ public:
+  explicit ExternalMemoryNoSampling(BatchParam batch_param);
+  GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
+                             DMatrix* dmat) override;
+
+ private:
+  BatchParam batch_param_;
+  std::unique_ptr<EllpackPageImpl> page_{nullptr};
   bool page_concatenated_{false};
 };
 
 /*! \brief Uniform sampling in in-memory mode. */
 class UniformSampling : public SamplingStrategy {
  public:
-  UniformSampling(EllpackPageImpl const* page, float subsample);
+  UniformSampling(BatchParam batch_param, float subsample);
   GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
                              DMatrix* dmat) override;
 
  private:
-  EllpackPageImpl const* page_;
+  BatchParam batch_param_;
   float subsample_;
 };
 
@@ -84,13 +83,12 @@ class ExternalMemoryUniformSampling : public SamplingStrategy {
 /*! \brief Gradient-based sampling in in-memory mode.. */
 class GradientBasedSampling : public SamplingStrategy {
  public:
-  GradientBasedSampling(EllpackPageImpl const* page, size_t n_rows, const BatchParam& batch_param,
-                        float subsample);
+  GradientBasedSampling(std::size_t n_rows, BatchParam batch_param, float subsample);
   GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
                              DMatrix* dmat) override;
 
  private:
-  EllpackPageImpl const* page_;
+  BatchParam batch_param_;
   float subsample_;
   dh::caching_device_vector<float> threshold_;
   dh::caching_device_vector<float> grad_sum_;
@@ -106,11 +104,11 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
 
  private:
   BatchParam batch_param_;
   float subsample_;
-  dh::caching_device_vector<float> threshold_;
-  dh::caching_device_vector<float> grad_sum_;
+  dh::device_vector<float> threshold_;
+  dh::device_vector<float> grad_sum_;
   std::unique_ptr<EllpackPageImpl> page_;
   dh::device_vector<GradientPair> gpair_;
-  dh::caching_device_vector<size_t> sample_row_index_;
+  dh::device_vector<size_t> sample_row_index_;
 };
 
 /*! \brief Draw a sample of rows from a DMatrix.
@@ -124,8 +122,8 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
  */
 class GradientBasedSampler {
  public:
-  GradientBasedSampler(Context const* ctx, EllpackPageImpl const* page, size_t n_rows,
-                       const BatchParam& batch_param, float subsample, int sampling_method);
+  GradientBasedSampler(Context const* ctx, size_t n_rows, const BatchParam& batch_param,
+                       float subsample, int sampling_method, bool is_external_memory);
 
   /*! \brief Sample from a DMatrix based on the given gradient pairs. */
   GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair, DMatrix* dmat);
*/ GradientBasedSample Sample(Context const* ctx, common::Span gpair, DMatrix* dmat); diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 5e5d2b5cb..2807dcfd7 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -176,7 +176,7 @@ struct GPUHistMakerDevice { Context const* ctx_; public: - EllpackPageImpl const* page; + EllpackPageImpl const* page{nullptr}; common::Span feature_types; BatchParam batch_param; @@ -205,41 +205,41 @@ struct GPUHistMakerDevice { std::unique_ptr feature_groups; - - GPUHistMakerDevice(Context const* ctx, EllpackPageImpl const* _page, - common::Span _feature_types, bst_uint _n_rows, + GPUHistMakerDevice(Context const* ctx, bool is_external_memory, + common::Span _feature_types, bst_row_t _n_rows, TrainParam _param, uint32_t column_sampler_seed, uint32_t n_features, BatchParam _batch_param) : evaluator_{_param, n_features, ctx->gpu_id}, ctx_(ctx), - page(_page), feature_types{_feature_types}, param(std::move(_param)), column_sampler(column_sampler_seed), interaction_constraints(param, n_features), batch_param(std::move(_batch_param)) { - sampler.reset(new GradientBasedSampler(ctx, page, _n_rows, batch_param, param.subsample, - param.sampling_method)); + sampler.reset(new GradientBasedSampler(ctx, _n_rows, batch_param, param.subsample, + param.sampling_method, is_external_memory)); if (!param.monotone_constraints.empty()) { // Copy assigning an empty vector causes an exception in MSVC debug builds monotone_constraints = param.monotone_constraints; } - // Init histogram - hist.Init(ctx_->gpu_id, page->Cuts().TotalBins()); monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id)); - feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense, - dh::MaxSharedMemoryOptin(ctx_->gpu_id), - sizeof(GradientSumT))); } ~GPUHistMakerDevice() { // NOLINT dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); } + void InitFeatureGroupsOnce() { + if (!feature_groups) { + CHECK(page); + feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense, + dh::MaxSharedMemoryOptin(ctx_->gpu_id), + sizeof(GradientSumT))); + } + } + // Reset values for each update iteration - // Note that the column sampler must be passed by value because it is not - // thread safe void Reset(HostDeviceVector* dh_gpair, DMatrix* dmat, int64_t num_columns) { auto const& info = dmat->Info(); this->column_sampler.Init(ctx_, num_columns, info.feature_weights.HostVector(), @@ -247,26 +247,30 @@ struct GPUHistMakerDevice { param.colsample_bytree); dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); - this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, - ctx_->gpu_id); - this->interaction_constraints.Reset(); if (d_gpair.size() != dh_gpair->Size()) { d_gpair.resize(dh_gpair->Size()); } - dh::safe_cuda(cudaMemcpyAsync( - d_gpair.data().get(), dh_gpair->ConstDevicePointer(), - dh_gpair->Size() * sizeof(GradientPair), cudaMemcpyDeviceToDevice)); + dh::safe_cuda(cudaMemcpyAsync(d_gpair.data().get(), dh_gpair->ConstDevicePointer(), + dh_gpair->Size() * sizeof(GradientPair), + cudaMemcpyDeviceToDevice)); auto sample = sampler->Sample(ctx_, dh::ToSpan(d_gpair), dmat); page = sample.page; gpair = sample.gpair; + this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, ctx_->gpu_id); + quantiser.reset(new GradientQuantiser(this->gpair)); row_partitioner.reset(); // Release the device memory first before reallocating - row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, sample.sample_rows)); + 
row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, sample.sample_rows)); + + // Init histogram + hist.Init(ctx_->gpu_id, page->Cuts().TotalBins()); hist.Reset(); + + this->InitFeatureGroupsOnce(); } GPUExpandEntry EvaluateRootSplit(GradientPairInt64 root_sum) { @@ -808,12 +812,11 @@ class GPUHistMaker : public TreeUpdater { collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0); auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()}; - auto page = (*dmat->GetBatches(ctx_, batch_param).begin()).Impl(); dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); info_->feature_types.SetDevice(ctx_->gpu_id); maker.reset(new GPUHistMakerDevice( - ctx_, page, info_->feature_types.ConstDeviceSpan(), info_->num_row_, *param, - column_sampling_seed, info_->num_col_, batch_param)); + ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_, + *param, column_sampling_seed, info_->num_col_, batch_param)); p_last_fmat_ = dmat; initialised_ = true; diff --git a/tests/cpp/common/test_io.cc b/tests/cpp/common/test_io.cc index feac8bd89..a64b60b80 100644 --- a/tests/cpp/common/test_io.cc +++ b/tests/cpp/common/test_io.cc @@ -1,5 +1,5 @@ -/*! - * Copyright (c) by XGBoost Contributors 2019 +/** + * Copyright 2019-2023, XGBoost Contributors */ #include @@ -9,8 +9,7 @@ #include "../helpers.h" #include "../filesystem.h" // dmlc::TemporaryDirectory -namespace xgboost { -namespace common { +namespace xgboost::common { TEST(MemoryFixSizeBuffer, Seek) { size_t constexpr kSize { 64 }; std::vector memory( kSize ); @@ -89,5 +88,54 @@ TEST(IO, LoadSequentialFile) { ASSERT_THROW(LoadSequentialFile("non-exist", true), dmlc::Error); } -} // namespace common -} // namespace xgboost + +TEST(IO, PrivateMmapStream) { + dmlc::TemporaryDirectory tempdir; + auto path = tempdir.path + "/testfile"; + + // The page size on Linux is usually set to 4096, while the allocation granularity on + // the Windows machine where this test was written is 65536. The test sizes are chosen to + // cover both.
+ std::size_t n_batches{64}; + std::size_t multiplier{2048}; + + std::vector> batches; + std::vector offset{0ul}; + + using T = std::int32_t; + + { + std::unique_ptr fo{dmlc::Stream::Create(path.c_str(), "w")}; + for (std::size_t i = 0; i < n_batches; ++i) { + std::size_t size = (i + 1) * multiplier; + std::vector data(size, 0); + std::iota(data.begin(), data.end(), i * i); + + fo->Write(static_cast(data.size())); + fo->Write(data.data(), data.size() * sizeof(T)); + + std::size_t bytes = sizeof(std::uint64_t) + data.size() * sizeof(T); + offset.push_back(bytes); + + batches.emplace_back(std::move(data)); + } + } + + // Turn sizes into offsets + std::partial_sum(offset.begin(), offset.end(), offset.begin()); + + for (std::size_t i = 0; i < n_batches; ++i) { + std::size_t off = offset[i]; + std::size_t n = offset.at(i + 1) - offset[i]; + std::unique_ptr fi{std::make_unique(path, off, n)}; + std::vector data; + + std::uint64_t size{0}; + fi->Read(&size); + data.resize(size); + + fi->Read(data.data(), size * sizeof(T)); + ASSERT_EQ(data, batches[i]); + } +} +} // namespace xgboost::common diff --git a/tests/cpp/histogram_helpers.h b/tests/cpp/histogram_helpers.h index 127f6fe44..6774f531c 100644 @@ -2,6 +2,10 @@ #include "../../src/data/ellpack_page.cuh" #endif +#include // for SparsePage + +#include "./helpers.h" // for RandomDataGenerator + namespace xgboost { #if defined(__CUDACC__) namespace { diff --git a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu index 95ae02aee..26ddfd8cc 100644 --- a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu +++ b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu @@ -39,7 +39,8 @@ void VerifySampling(size_t page_size, EXPECT_NE(page->n_rows, kRows); } - GradientBasedSampler sampler(&ctx, page, kRows, param, subsample, sampling_method); + GradientBasedSampler sampler(&ctx, kRows, param, subsample, sampling_method, + !fixed_size_sampling); auto sample = sampler.Sample(&ctx, gpair.DeviceSpan(), dmat.get()); if (fixed_size_sampling) { @@ -93,7 +94,7 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) { auto page = (*dmat->GetBatches(&ctx, param).begin()).Impl(); EXPECT_NE(page->n_rows, kRows); - GradientBasedSampler sampler(&ctx, page, kRows, param, kSubsample, TrainParam::kUniform); + GradientBasedSampler sampler(&ctx, kRows, param, kSubsample, TrainParam::kUniform, true); auto sample = sampler.Sample(&ctx, gpair.DeviceSpan(), dmat.get()); auto sampled_page = sample.page; EXPECT_EQ(sample.sample_rows, kRows); @@ -141,7 +142,8 @@ TEST(GradientBasedSampler, GradientBasedSampling) { constexpr size_t kPageSize = 0; constexpr float kSubsample = 0.8; constexpr int kSamplingMethod = TrainParam::kGradientBased; - VerifySampling(kPageSize, kSubsample, kSamplingMethod); + constexpr bool kFixedSizeSampling = true; + VerifySampling(kPageSize, kSubsample, kSamplingMethod, kFixedSizeSampling); } TEST(GradientBasedSampler, GradientBasedSamplingExternalMemory) { diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index 1bd4ece20..fd3034db5 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -92,8 +92,8 @@ void TestBuildHist(bool use_shared_memory_histograms) { auto page = BuildEllpackPage(kNRows, kNCols); BatchParam batch_param{}; Context ctx{MakeCUDACtx(0)}; - GPUHistMakerDevice maker(&ctx, page.get(), {}, kNRows, param, kNCols, kNCols, - batch_param); +
GPUHistMakerDevice maker(&ctx, /*is_external_memory=*/false, {}, kNRows, param, + kNCols, kNCols, batch_param); xgboost::SimpleLCG gen; xgboost::SimpleRealUniformDistribution dist(0.0f, 1.0f); HostDeviceVector gpair(kNRows); @@ -106,9 +106,15 @@ void TestBuildHist(bool use_shared_memory_histograms) { thrust::host_vector h_gidx_buffer (page->gidx_buffer.HostVector()); maker.row_partitioner.reset(new RowPartitioner(0, kNRows)); + + maker.hist.Init(0, page->Cuts().TotalBins()); maker.hist.AllocateHistograms({0}); + maker.gpair = gpair.DeviceSpan(); maker.quantiser.reset(new GradientQuantiser(maker.gpair)); + maker.page = page.get(); + + maker.InitFeatureGroupsOnce(); BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0), maker.feature_groups->DeviceAccessor(0), gpair.DeviceSpan(), @@ -126,8 +132,8 @@ void TestBuildHist(bool use_shared_memory_histograms) { std::vector solution = GetHostHistGpair(); for (size_t i = 0; i < h_result.size(); ++i) { auto result = maker.quantiser->ToFloatingPoint(h_result[i]); - EXPECT_NEAR(result.GetGrad(), solution[i].GetGrad(), 0.01f); - EXPECT_NEAR(result.GetHess(), solution[i].GetHess(), 0.01f); + ASSERT_NEAR(result.GetGrad(), solution[i].GetGrad(), 0.01f); + ASSERT_NEAR(result.GetHess(), solution[i].GetHess(), 0.01f); } } diff --git a/tests/python-gpu/test_from_cudf.py b/tests/python-gpu/test_from_cudf.py index 523dbf931..610c717a9 100644 --- a/tests/python-gpu/test_from_cudf.py +++ b/tests/python-gpu/test_from_cudf.py @@ -305,7 +305,7 @@ class IterForDMatrixTest(xgb.core.DataIter): self._labels = [rng.randn(self.rows)] * self.BATCHES self.it = 0 # set iterator to 0 - super().__init__() + super().__init__(cache_prefix=None) def as_array(self): import cudf diff --git a/tests/python/test_data_iterator.py b/tests/python/test_data_iterator.py index 0590a4954..24c117f15 100644 --- a/tests/python/test_data_iterator.py +++ b/tests/python/test_data_iterator.py @@ -64,7 +64,8 @@ def run_data_iterator( subsample_rate = 0.8 if subsample else 1.0 it = IteratorForTest( - *make_batches(n_samples_per_batch, n_features, n_batches, use_cupy) + *make_batches(n_samples_per_batch, n_features, n_batches, use_cupy), + cache="cache" ) if n_batches == 0: with pytest.raises(ValueError, match="1 batch"): From 6d22ea793c6806138673116ea326ebd81351ffbd Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 19 Jun 2023 21:27:03 +0800 Subject: [PATCH 004/136] Test QDM with sparse data on CPU. 
(#9316) --- tests/python/test_quantile_dmatrix.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/python/test_quantile_dmatrix.py b/tests/python/test_quantile_dmatrix.py index 0e0aaed08..c1ec23ea3 100644 --- a/tests/python/test_quantile_dmatrix.py +++ b/tests/python/test_quantile_dmatrix.py @@ -253,9 +253,12 @@ class TestQuantileDMatrix: self.run_ref_dmatrix(rng, "hist", True) self.run_ref_dmatrix(rng, "hist", False) - def test_predict(self) -> None: - n_samples, n_features = 16, 2 - X, y = make_categorical(n_samples, n_features, n_categories=13, onehot=False) + @pytest.mark.parametrize("sparsity", [0.0, 0.5]) + def test_predict(self, sparsity: float) -> None: + n_samples, n_features = 256, 4 + X, y = make_categorical( + n_samples, n_features, n_categories=13, onehot=False, sparsity=sparsity + ) Xy = xgb.DMatrix(X, y, enable_categorical=True) booster = xgb.train({"tree_method": "hist"}, Xy) From 4066d682611d8de85529deca9a952a636abbda2e Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 20 Jun 2023 17:56:47 +0800 Subject: [PATCH 005/136] [doc] Clarify early stopping. (#9304) --- python-package/xgboost/sklearn.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index 43d531a9d..4cc8a174c 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -381,17 +381,21 @@ __model_doc = f""" every **early_stopping_rounds** round(s) to continue training. Requires at least one item in **eval_set** in :py:meth:`fit`. - - The method returns the model from the last iteration, not the best one, use a - callback :py:class:`xgboost.callback.EarlyStopping` if returning the best - model is preferred. + - If early stopping occurs, the model will have two additional attributes: + :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the + :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal + number of trees during inference. If users want to access the full model + (including trees built after early stopping), they can specify the + `iteration_range` in these inference methods. In addition, other utilities + like model plotting can also use the entire model. + + - If you prefer to discard the trees after `best_iteration`, consider using the + callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for early stopping. If there's more than one metric in **eval_metric**, the last metric will be used for early stopping. - - If early stopping occurs, the model will have three additional fields: - :py:attr:`best_score`, :py:attr:`best_iteration`. - .. note:: This parameter replaces `early_stopping_rounds` in :py:meth:`fit` method. From 54da4b31856625e9cca1848e1aa8ab8bf584e5fe Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 22 Jun 2023 06:43:11 +0800 Subject: [PATCH 006/136] Cleanup to prepare for using mmap pointer in external memory. (#9317) - Update SparseDMatrix comment. - Use a pointer in the bitfield. We will replace the `std::vector` in `ColumnMatrix` with bitfield. - Clean up the page source. The timer is removed as it's inaccurate once we swap the mmap pointer into the page. 
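To make the direction of the bitfield change concrete, here is a minimal sketch of a non-owning, pointer-based bit view (illustrative only: the name `BitView` and the fixed 64-bit word type are simplifications of the templated `BitFieldContainer` in `src/common/bitfield.h`):

#include <cassert>
#include <cstddef>
#include <cstdint>

// A bit view that borrows a raw pointer instead of owning a std::vector, so
// the same view can sit on top of heap memory today and an mmap-ed region later.
class BitView {
 public:
  BitView(std::uint64_t const* bits, std::size_t n_values)
      : bits_{bits}, n_values_{n_values} {}
  // Total number of addressable bits; may be larger than the number of valid bits.
  std::size_t Capacity() const { return n_values_ * 64; }
  bool Check(std::size_t pos) const {
    assert(pos / 64 < n_values_);
    return ((bits_[pos / 64] >> (pos % 64)) & 1u) != 0;
  }

 private:
  std::uint64_t const* bits_{nullptr};  // not owned; the resource must outlive the view
  std::size_t n_values_{0};             // number of 64-bit words viewed
};

Keeping only a pointer and a length in the view lets the owner of the memory be swapped without touching any of the reading code.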
--- src/collective/nccl_device_communicator.cu | 4 +- src/common/bitfield.h | 94 +++++++++++-------- src/common/categorical.h | 5 +- src/common/hist_util.h | 29 +++--- src/data/array_interface.h | 2 +- src/data/data.cc | 3 +- src/data/data.cu | 3 +- src/data/ellpack_page.cu | 8 +- src/data/sparse_page_dmatrix.h | 46 ++++----- src/data/sparse_page_source.h | 92 ++++++++++-------- src/predictor/gpu_predictor.cu | 6 +- src/tree/constraints.cu | 24 ++--- src/tree/tree_model.cc | 4 +- tests/cpp/common/test_bitfield.cc | 8 +- tests/cpp/common/test_bitfield.cu | 49 ++++++---- tests/cpp/common/test_column_matrix.cc | 4 +- .../cpp/tree/gpu_hist/test_evaluate_splits.cu | 2 - tests/cpp/tree/test_constraints.cu | 8 +- 18 files changed, 220 insertions(+), 171 deletions(-) diff --git a/src/collective/nccl_device_communicator.cu b/src/collective/nccl_device_communicator.cu index 6599d4b5a..631193db4 100644 --- a/src/collective/nccl_device_communicator.cu +++ b/src/collective/nccl_device_communicator.cu @@ -70,7 +70,7 @@ NcclDeviceCommunicator::~NcclDeviceCommunicator() { namespace { ncclDataType_t GetNcclDataType(DataType const &data_type) { - ncclDataType_t result; + ncclDataType_t result{ncclInt8}; switch (data_type) { case DataType::kInt8: result = ncclInt8; @@ -108,7 +108,7 @@ bool IsBitwiseOp(Operation const &op) { } ncclRedOp_t GetNcclRedOp(Operation const &op) { - ncclRedOp_t result; + ncclRedOp_t result{ncclMax}; switch (op) { case Operation::kMax: result = ncclMax; diff --git a/src/common/bitfield.h b/src/common/bitfield.h index 6bb5f3404..6cdf4412e 100644 --- a/src/common/bitfield.h +++ b/src/common/bitfield.h @@ -1,5 +1,5 @@ -/*! - * Copyright 2019 by Contributors +/** + * Copyright 2019-2023, XGBoost Contributors * \file bitfield.h */ #ifndef XGBOOST_COMMON_BITFIELD_H_ @@ -50,14 +50,17 @@ __forceinline__ __device__ BitFieldAtomicType AtomicAnd(BitFieldAtomicType* addr } #endif // defined(__CUDACC__) -/*! - * \brief A non-owning type with auxiliary methods defined for manipulating bits. +/** + * @brief A non-owning type with auxiliary methods defined for manipulating bits. * - * \tparam Direction Whether the bits start from left or from right. + * @tparam VT Underlying value type, must be an unsigned integer. + * @tparam Direction Whether the bits start from left or from right. + * @tparam IsConst Whether the view is const. 
*/ template struct BitFieldContainer { using value_type = std::conditional_t; // NOLINT + using size_type = size_t; // NOLINT using index_type = size_t; // NOLINT using pointer = value_type*; // NOLINT @@ -70,8 +73,9 @@ struct BitFieldContainer { }; private: - common::Span bits_; - static_assert(!std::is_signed::value, "Must use unsiged type as underlying storage."); + value_type* bits_{nullptr}; + size_type n_values_{0}; + static_assert(!std::is_signed::value, "Must use an unsigned type as the underlying storage."); public: XGBOOST_DEVICE static Pos ToBitPos(index_type pos) { @@ -86,13 +90,15 @@ struct BitFieldContainer { public: BitFieldContainer() = default; - XGBOOST_DEVICE explicit BitFieldContainer(common::Span bits) : bits_{bits} {} - XGBOOST_DEVICE BitFieldContainer(BitFieldContainer const& other) : bits_{other.bits_} {} + XGBOOST_DEVICE explicit BitFieldContainer(common::Span bits) + : bits_{bits.data()}, n_values_{bits.size()} {} + BitFieldContainer(BitFieldContainer const& other) = default; + BitFieldContainer(BitFieldContainer&& other) = default; BitFieldContainer &operator=(BitFieldContainer const &that) = default; BitFieldContainer &operator=(BitFieldContainer &&that) = default; - XGBOOST_DEVICE common::Span Bits() { return bits_; } - XGBOOST_DEVICE common::Span Bits() const { return bits_; } + XGBOOST_DEVICE auto Bits() { return common::Span{bits_, NumValues()}; } + XGBOOST_DEVICE auto Bits() const { return common::Span{bits_, NumValues()}; } /*\brief Compute the size of needed memory allocation. The returned value is in terms * of number of elements with `BitFieldContainer::value_type'. @@ -103,17 +109,17 @@ struct BitFieldContainer { #if defined(__CUDA_ARCH__) __device__ BitFieldContainer& operator|=(BitFieldContainer const& rhs) { auto tid = blockIdx.x * blockDim.x + threadIdx.x; - size_t min_size = min(bits_.size(), rhs.bits_.size()); + size_t min_size = min(NumValues(), rhs.NumValues()); if (tid < min_size) { - bits_[tid] |= rhs.bits_[tid]; + Data()[tid] |= rhs.Data()[tid]; } return *this; } #else BitFieldContainer& operator|=(BitFieldContainer const& rhs) { - size_t min_size = std::min(bits_.size(), rhs.bits_.size()); + size_t min_size = std::min(NumValues(), rhs.NumValues()); for (size_t i = 0; i < min_size; ++i) { - bits_[i] |= rhs.bits_[i]; + Data()[i] |= rhs.Data()[i]; } return *this; } @@ -121,75 +127,85 @@ struct BitFieldContainer { #if defined(__CUDA_ARCH__) __device__ BitFieldContainer& operator&=(BitFieldContainer const& rhs) { - size_t min_size = min(bits_.size(), rhs.bits_.size()); + size_t min_size = min(NumValues(), rhs.NumValues()); auto tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < min_size) { - bits_[tid] &= rhs.bits_[tid]; + Data()[tid] &= rhs.Data()[tid]; } return *this; } #else BitFieldContainer& operator&=(BitFieldContainer const& rhs) { - size_t min_size = std::min(bits_.size(), rhs.bits_.size()); + size_t min_size = std::min(NumValues(), rhs.NumValues()); for (size_t i = 0; i < min_size; ++i) { - bits_[i] &= rhs.bits_[i]; + Data()[i] &= rhs.Data()[i]; } return *this; } #endif // defined(__CUDA_ARCH__) #if defined(__CUDA_ARCH__) - __device__ auto Set(index_type pos) { + __device__ auto Set(index_type pos) noexcept(true) { Pos pos_v = Direction::Shift(ToBitPos(pos)); - value_type& value = bits_[pos_v.int_pos]; + value_type& value = Data()[pos_v.int_pos]; value_type set_bit = kOne << pos_v.bit_pos; using Type = typename dh::detail::AtomicDispatcher::Type; atomicOr(reinterpret_cast(&value), set_bit); } - __device__ void Clear(index_type pos) {
+ __device__ void Clear(index_type pos) noexcept(true) { Pos pos_v = Direction::Shift(ToBitPos(pos)); - value_type& value = bits_[pos_v.int_pos]; + value_type& value = Data()[pos_v.int_pos]; value_type clear_bit = ~(kOne << pos_v.bit_pos); using Type = typename dh::detail::AtomicDispatcher::Type; atomicAnd(reinterpret_cast(&value), clear_bit); } #else - void Set(index_type pos) { + void Set(index_type pos) noexcept(true) { Pos pos_v = Direction::Shift(ToBitPos(pos)); - value_type& value = bits_[pos_v.int_pos]; + value_type& value = Data()[pos_v.int_pos]; value_type set_bit = kOne << pos_v.bit_pos; value |= set_bit; } - void Clear(index_type pos) { + void Clear(index_type pos) noexcept(true) { Pos pos_v = Direction::Shift(ToBitPos(pos)); - value_type& value = bits_[pos_v.int_pos]; + value_type& value = Data()[pos_v.int_pos]; value_type clear_bit = ~(kOne << pos_v.bit_pos); value &= clear_bit; } #endif // defined(__CUDA_ARCH__) - XGBOOST_DEVICE bool Check(Pos pos_v) const { + XGBOOST_DEVICE bool Check(Pos pos_v) const noexcept(true) { pos_v = Direction::Shift(pos_v); - SPAN_LT(pos_v.int_pos, bits_.size()); - value_type const value = bits_[pos_v.int_pos]; + assert(pos_v.int_pos < NumValues()); + value_type const value = Data()[pos_v.int_pos]; value_type const test_bit = kOne << pos_v.bit_pos; value_type result = test_bit & value; return static_cast(result); } - XGBOOST_DEVICE bool Check(index_type pos) const { + [[nodiscard]] XGBOOST_DEVICE bool Check(index_type pos) const noexcept(true) { Pos pos_v = ToBitPos(pos); return Check(pos_v); } + /** + * @brief Returns the total number of bits that can be viewed. This is equal to or + * larger than the actual number of valid bits. + */ + [[nodiscard]] XGBOOST_DEVICE size_type Capacity() const noexcept(true) { + return kValueSize * NumValues(); + } + /** + * @brief Number of storage units used in this bit field. + */ + [[nodiscard]] XGBOOST_DEVICE size_type NumValues() const noexcept(true) { return n_values_; } - XGBOOST_DEVICE size_t Size() const { return kValueSize * bits_.size(); } + XGBOOST_DEVICE pointer Data() const noexcept(true) { return bits_; } - XGBOOST_DEVICE pointer Data() const { return bits_.data(); } - - inline friend std::ostream & - operator<<(std::ostream &os, BitFieldContainer field) { - os << "Bits " << "storage size: " << field.bits_.size() << "\n"; - for (typename common::Span::index_type i = 0; i < field.bits_.size(); ++i) { - std::bitset::kValueSize> bset(field.bits_[i]); + inline friend std::ostream& operator<<(std::ostream& os, + BitFieldContainer field) { + os << "Bits " + << "storage size: " << field.NumValues() << "\n"; + for (typename common::Span::index_type i = 0; i < field.NumValues(); ++i) { + std::bitset::kValueSize> bset(field.Data()[i]); os << bset << "\n"; } return os; diff --git a/src/common/categorical.h b/src/common/categorical.h index d7e262812..249a818e5 100644 --- a/src/common/categorical.h +++ b/src/common/categorical.h @@ -1,5 +1,5 @@ -/*!
- * Copyright 2020-2022 by XGBoost Contributors +/** + * Copyright 2020-2023, XGBoost Contributors * \file categorical.h */ #ifndef XGBOOST_COMMON_CATEGORICAL_H_ @@ -10,7 +10,6 @@ #include "bitfield.h" #include "xgboost/base.h" #include "xgboost/data.h" -#include "xgboost/parameter.h" #include "xgboost/span.h" namespace xgboost { diff --git a/src/common/hist_util.h b/src/common/hist_util.h index 6380952d7..d2edf2ec8 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -84,7 +84,7 @@ class HistogramCuts { return *this; } - uint32_t FeatureBins(bst_feature_t feature) const { + [[nodiscard]] bst_bin_t FeatureBins(bst_feature_t feature) const { return cut_ptrs_.ConstHostVector().at(feature + 1) - cut_ptrs_.ConstHostVector()[feature]; } @@ -92,8 +92,8 @@ class HistogramCuts { std::vector const& Values() const { return cut_values_.ConstHostVector(); } std::vector const& MinValues() const { return min_vals_.ConstHostVector(); } - bool HasCategorical() const { return has_categorical_; } - float MaxCategory() const { return max_cat_; } + [[nodiscard]] bool HasCategorical() const { return has_categorical_; } + [[nodiscard]] float MaxCategory() const { return max_cat_; } /** * \brief Set meta info about categorical features. * @@ -105,12 +105,13 @@ class HistogramCuts { max_cat_ = max_cat; } - size_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); } + [[nodiscard]] bst_bin_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); } // Return the index of a cut point that is strictly greater than the input // value, or the last available index if none exists - bst_bin_t SearchBin(float value, bst_feature_t column_id, std::vector const& ptrs, - std::vector const& values) const { + [[nodiscard]] bst_bin_t SearchBin(float value, bst_feature_t column_id, + std::vector const& ptrs, + std::vector const& values) const { auto end = ptrs[column_id + 1]; auto beg = ptrs[column_id]; auto it = std::upper_bound(values.cbegin() + beg, values.cbegin() + end, value); @@ -119,20 +120,20 @@ class HistogramCuts { return idx; } - bst_bin_t SearchBin(float value, bst_feature_t column_id) const { + [[nodiscard]] bst_bin_t SearchBin(float value, bst_feature_t column_id) const { return this->SearchBin(value, column_id, Ptrs(), Values()); } - /** * \brief Search the bin index for numerical feature. */ - bst_bin_t SearchBin(Entry const& e) const { return SearchBin(e.fvalue, e.index); } + [[nodiscard]] bst_bin_t SearchBin(Entry const& e) const { return SearchBin(e.fvalue, e.index); } /** * \brief Search the bin index for categorical feature. */ - bst_bin_t SearchCatBin(float value, bst_feature_t fidx, std::vector const& ptrs, - std::vector const& vals) const { + [[nodiscard]] bst_bin_t SearchCatBin(float value, bst_feature_t fidx, + std::vector const& ptrs, + std::vector const& vals) const { auto end = ptrs.at(fidx + 1) + vals.cbegin(); auto beg = ptrs[fidx] + vals.cbegin(); // Truncates the value in case it's not perfectly rounded. 
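To make the bin lookup above concrete, here is a self-contained sketch of what `SearchBin` does for a single feature once the per-feature slice of cut values is in hand (simplified; `SearchBinSketch` is an illustrative name, not part of XGBoost):

#include <algorithm>
#include <cstdint>
#include <iterator>
#include <vector>

// Return the index of the first cut point strictly greater than `value`,
// clamped to the last cut when `value` exceeds every cut point.
std::int32_t SearchBinSketch(std::vector<float> const& cuts, float value) {
  auto it = std::upper_bound(cuts.cbegin(), cuts.cend(), value);
  if (it == cuts.cend()) {
    --it;  // clamp to the last available bin
  }
  return static_cast<std::int32_t>(std::distance(cuts.cbegin(), it));
}

// With cuts {0.5f, 1.5f, 2.5f}, a value of 1.0f lands in bin 1.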
@@ -143,12 +144,14 @@ class HistogramCuts { } return bin_idx; } - bst_bin_t SearchCatBin(float value, bst_feature_t fidx) const { + [[nodiscard]] bst_bin_t SearchCatBin(float value, bst_feature_t fidx) const { auto const& ptrs = this->Ptrs(); auto const& vals = this->Values(); return this->SearchCatBin(value, fidx, ptrs, vals); } - bst_bin_t SearchCatBin(Entry const& e) const { return SearchCatBin(e.fvalue, e.index); } + [[nodiscard]] bst_bin_t SearchCatBin(Entry const& e) const { + return SearchCatBin(e.fvalue, e.index); + } /** * \brief Return numerical bin value given bin index. diff --git a/src/data/array_interface.h b/src/data/array_interface.h index 1b18f140a..bd66c2a53 100644 --- a/src/data/array_interface.h +++ b/src/data/array_interface.h @@ -590,7 +590,7 @@ class ArrayInterface { template void DispatchDType(ArrayInterface const array, std::int32_t device, Fn fn) { // Only used for cuDF at the moment. - CHECK_EQ(array.valid.Size(), 0); + CHECK_EQ(array.valid.Capacity(), 0); auto dispatch = [&](auto t) { using T = std::remove_const_t const; // Set the data size to max as we don't know the original size of a sliced array: diff --git a/src/data/data.cc b/src/data/data.cc index 00cff8ab0..d305749ee 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -416,7 +416,8 @@ void CopyTensorInfoImpl(Context const& ctx, Json arr_interface, linalg::TensorReshape(array.shape); return; } - CHECK(array.valid.Size() == 0) << "Meta info like label or weight can not have missing value."; + CHECK_EQ(array.valid.Capacity(), 0) + << "Meta info like label or weight can not have missing value."; if (array.is_contiguous && array.type == ToDType::kType) { // Handle contigious p_out->ModifyInplace([&](HostDeviceVector* data, common::Span shape) { diff --git a/src/data/data.cu b/src/data/data.cu index eccbe7567..0f1fda661 100644 --- a/src/data/data.cu +++ b/src/data/data.cu @@ -33,7 +33,8 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens p_out->Reshape(array.shape); return; } - CHECK(array.valid.Size() == 0) << "Meta info like label or weight can not have missing value."; + CHECK_EQ(array.valid.Capacity(), 0) + << "Meta info like label or weight can not have missing value."; auto ptr_device = SetDeviceToPtr(array.data); p_out->SetDevice(ptr_device); diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index aa218fa31..13fcf9adf 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -5,6 +5,7 @@ #include #include "../common/categorical.h" +#include "../common/cuda_context.cuh" #include "../common/hist_util.cuh" #include "../common/random.h" #include "../common/transform_iterator.h" // MakeIndexTransformIter @@ -313,7 +314,8 @@ void CopyGHistToEllpack(GHistIndexMatrix const& page, common::Span auto d_csc_indptr = dh::ToSpan(csc_indptr); auto bin_type = page.index.GetBinTypeSize(); - common::CompressedBufferWriter writer{page.cut.TotalBins() + 1}; // +1 for null value + common::CompressedBufferWriter writer{page.cut.TotalBins() + + static_cast(1)}; // +1 for null value dh::LaunchN(row_stride * page.Size(), [=] __device__(size_t idx) mutable { auto ridx = idx / row_stride; @@ -357,8 +359,10 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag // copy gidx common::CompressedByteT* d_compressed_buffer = gidx_buffer.DevicePointer(); - dh::device_vector row_ptr(page.row_ptr); + dh::device_vector row_ptr(page.row_ptr.size()); auto d_row_ptr = dh::ToSpan(row_ptr); + dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), 
page.row_ptr.data(), d_row_ptr.size_bytes(), + cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream())); auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft); auto null = accessor.NullValue(); diff --git a/src/data/sparse_page_dmatrix.h b/src/data/sparse_page_dmatrix.h index 02aa9a5c0..d4324000f 100644 --- a/src/data/sparse_page_dmatrix.h +++ b/src/data/sparse_page_dmatrix.h @@ -7,9 +7,6 @@ #ifndef XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_ #define XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_ -#include -#include - #include #include #include @@ -20,35 +17,33 @@ #include "ellpack_page_source.h" #include "gradient_index_page_source.h" #include "sparse_page_source.h" +#include "xgboost/data.h" +#include "xgboost/logging.h" -namespace xgboost { -namespace data { +namespace xgboost::data { /** * \brief DMatrix used for external memory. * * The external memory is created for controlling memory usage by splitting up data into - * multiple batches. However that doesn't mean we will actually process exact 1 batch at - * a time, which would be terribly slow considering that we have to loop through the - * whole dataset for every tree split. So we use async pre-fetch and let caller to decide - * how many batches it wants to process by returning data as shared pointer. The caller - * can use async function to process the data or just stage those batches, making the - * decision is out of the scope for sparse page dmatrix. These 2 optimizations might - * defeat the purpose of splitting up dataset since if you load all the batches then the - * memory usage is even worse than using a single batch. Essentially we need to control - * how many batches can be in memory at the same time. + * multiple batches. However, that doesn't mean we will actually process exactly 1 batch + * at a time, which would be terribly slow considering that we have to loop through the + * whole dataset for every tree split. So we use async to pre-fetch pages and let the + * caller decide how many batches it wants to process by returning data as a shared + * pointer. The caller can use an async function to process the data or just stage those + * batches based on its use cases. These two optimizations might defeat the purpose of + * splitting up the dataset, since if you stage all the batches then the memory usage might be + * even worse than using a single batch. As a result, we must control how many batches can + * be in memory at any given time. * - * Right now the write to the cache is sequential operation and is blocking, reading from - * cache is async but with a hard coded limit of 4 pages as an heuristic. So by sparse - * dmatrix itself there can be only 9 pages in main memory (might be of different types) - * at the same time: 1 page pending for write, 4 pre-fetched sparse pages, 4 pre-fetched - * dependent pages. If the caller stops iteration at the middle and start again, then the - * number of pages in memory can hit 16 due to pre-fetching, but this should be a bug in - * caller's code (XGBoost doesn't discard a large portion of data at the end, there's not - * sampling algo that samples only the first portion of data). + * Right now the write to the cache is a sequential operation and is blocking. Reading + * from cache, on the other hand, is async, but with a hard-coded limit of 3 pages as a + * heuristic. So the sparse DMatrix by itself can have only 7 pages in main memory (might + * be of different types) at the same time: 1 page pending for write, 3 pre-fetched sparse + * pages, 3 pre-fetched dependent pages.
* * Of course if the caller decides to retain some batches to perform parallel processing, * then we might load all pages in memory, which is also considered as a bug in caller's - * code. So if the algo supports external memory, it must be careful that queue for async + * code. So if the algo supports external memory, it must be careful that queue for async * call must have an upper limit. * * Another assumption we make is that the data must be immutable so caller should never @@ -101,7 +96,7 @@ class SparsePageDMatrix : public DMatrix { MetaInfo &Info() override; const MetaInfo &Info() const override; Context const *Ctx() const override { return &fmat_ctx_; } - + // The only DMatrix implementation that returns false. bool SingleColBlock() const override { return false; } DMatrix *Slice(common::Span) override { LOG(FATAL) << "Slicing DMatrix is not supported for external memory."; @@ -153,6 +148,5 @@ inline std::string MakeCache(SparsePageDMatrix *ptr, std::string format, std::st } return id; } -} // namespace data -} // namespace xgboost +} // namespace xgboost::data #endif // XGBOOST_DATA_SPARSE_PAGE_DMATRIX_H_ diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h index b4e42f2db..9f7bee521 100644 --- a/src/data/sparse_page_source.h +++ b/src/data/sparse_page_source.h @@ -6,39 +6,43 @@ #define XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_ #include // for min -#include // async +#include // for async #include #include #include #include -#include +#include // for pair, move #include #include "../common/common.h" -#include "../common/io.h" // for PrivateMmapStream, PadPageForMMAP +#include "../common/io.h" // for PrivateMmapConstStream #include "../common/timer.h" // for Monitor, Timer #include "adapter.h" -#include "dmlc/common.h" // OMPException -#include "proxy_dmatrix.h" -#include "sparse_page_writer.h" +#include "dmlc/common.h" // for OMPException +#include "proxy_dmatrix.h" // for DMatrixProxy +#include "sparse_page_writer.h" // for SparsePageFormat #include "xgboost/base.h" #include "xgboost/data.h" namespace xgboost::data { inline void TryDeleteCacheFile(const std::string& file) { if (std::remove(file.c_str()) != 0) { + // Don't throw, this is called in a destructor. LOG(WARNING) << "Couldn't remove external memory cache file " << file << "; you may want to remove it manually"; } } +/** + * @brief Information about the cache including path and page offsets. + */ struct Cache { // whether the write to the cache is complete bool written; std::string name; std::string format; // offset into binary cache file. - std::vector offset; + std::vector offset; Cache(bool w, std::string n, std::string fmt) : written{w}, name{std::move(n)}, format{std::move(fmt)} { @@ -50,14 +54,24 @@ struct Cache { return name + format; } - std::string ShardName() { + [[nodiscard]] std::string ShardName() const { return ShardName(this->name, this->format); } - void Push(std::size_t n_bytes) { - offset.push_back(n_bytes); + /** + * @brief Record a page with size of n_bytes. + */ + void Push(std::size_t n_bytes) { offset.push_back(n_bytes); } + /** + * @brief Returns the view start and length for the i^th page. + */ + [[nodiscard]] auto View(std::size_t i) const { + std::uint64_t off = offset.at(i); + std::uint64_t len = offset.at(i + 1) - offset[i]; + return std::pair{off, len}; } - - // The write is completed. + /** + * @brief Call this once the write for the cache is complete. 
+ */ void Commit() { if (!written) { std::partial_sum(offset.begin(), offset.end(), offset.begin()); @@ -66,7 +80,7 @@ struct Cache { } }; -// Prevents multi-threaded call. +// Prevents multi-threaded call to `GetBatches`. class TryLockGuard { std::mutex& lock_; @@ -79,22 +93,25 @@ class TryLockGuard { } }; +/** + * @brief Base class for all page sources. Handles fetching, writing, and iteration. + */ template class SparsePageSourceImpl : public BatchIteratorImpl { protected: // Prevents calling this iterator from multiple places(or threads). std::mutex single_threaded_; - + // The current page. std::shared_ptr page_; bool at_end_ {false}; float missing_; - int nthreads_; + std::int32_t nthreads_; bst_feature_t n_features_; - - uint32_t count_{0}; - - uint32_t n_batches_ {0}; + // Index to the current page. + std::uint32_t count_{0}; + // Total number of batches. + std::uint32_t n_batches_{0}; std::shared_ptr cache_info_; @@ -102,6 +119,9 @@ class SparsePageSourceImpl : public BatchIteratorImpl { // A ring storing futures to data. Since the DMatrix iterator is forward only, so we // can pre-fetch data in a ring. std::unique_ptr ring_{new Ring}; + // Catching exception in pre-fetch threads to prevent segfault. Not always work though, + // OOM error can be delayed due to lazy commit. On the bright side, if mmap is used then + // OOM error should be rare. dmlc::OMPException exec_; common::Monitor monitor_; @@ -123,7 +143,6 @@ class SparsePageSourceImpl : public BatchIteratorImpl { exec_.Rethrow(); - monitor_.Start("launch"); for (std::size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) { fetch_it %= n_batches_; // ring if (ring_->at(fetch_it).valid()) { @@ -134,33 +153,25 @@ class SparsePageSourceImpl : public BatchIteratorImpl { ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self, this]() { auto page = std::make_shared(); this->exec_.Run([&] { - common::Timer timer; - timer.Start(); std::unique_ptr> fmt{CreatePageFormat("raw")}; - auto n = self->cache_info_->ShardName(); - - std::uint64_t offset = self->cache_info_->offset.at(fetch_it); - std::uint64_t length = self->cache_info_->offset.at(fetch_it + 1) - offset; - - auto fi = std::make_unique(n, offset, length); + auto name = self->cache_info_->ShardName(); + auto [offset, length] = self->cache_info_->View(fetch_it); + auto fi = std::make_unique(name, offset, length); CHECK(fmt->Read(page.get(), fi.get())); - timer.Stop(); - - LOG(INFO) << "Read a page `" << typeid(S).name() << "` in " << timer.ElapsedSeconds() - << " seconds."; }); return page; }); } - monitor_.Stop("launch"); CHECK_EQ(std::count_if(ring_->cbegin(), ring_->cend(), [](auto const& f) { return f.valid(); }), n_prefetch_batches) << "Sparse DMatrix assumes forward iteration."; + monitor_.Start("Wait"); page_ = (*ring_)[count_].get(); - monitor_.Stop("Wait"); CHECK(!(*ring_)[count_].valid()); + monitor_.Stop("Wait"); + exec_.Rethrow(); return true; @@ -183,6 +194,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl { auto bytes = fmt->Write(*page_, fo.get()); timer.Stop(); + // Not entirely accurate, the kernels doesn't have to flush the data. LOG(INFO) << static_cast(bytes) / 1024.0 / 1024.0 << " MB written in " << timer.ElapsedSeconds() << " seconds."; cache_info_->Push(bytes); @@ -204,6 +216,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl { SparsePageSourceImpl(SparsePageSourceImpl const &that) = delete; ~SparsePageSourceImpl() override { + // Don't orphan the threads. 
for (auto& fu : *ring_) { if (fu.valid()) { fu.get(); @@ -211,18 +224,18 @@ class SparsePageSourceImpl : public BatchIteratorImpl { } } - uint32_t Iter() const { return count_; } + [[nodiscard]] uint32_t Iter() const { return count_; } const S &operator*() const override { CHECK(page_); return *page_; } - std::shared_ptr Page() const override { + [[nodiscard]] std::shared_ptr Page() const override { return page_; } - bool AtEnd() const override { + [[nodiscard]] bool AtEnd() const override { return at_end_; } @@ -230,20 +243,23 @@ class SparsePageSourceImpl : public BatchIteratorImpl { TryLockGuard guard{single_threaded_}; at_end_ = false; count_ = 0; + // Pre-fetch for the next round of iterations. this->Fetch(); } }; #if defined(XGBOOST_USE_CUDA) +// Push data from CUDA. void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page); #else inline void DevicePush(DMatrixProxy*, float, SparsePage*) { common::AssertGPUSupport(); } #endif class SparsePageSource : public SparsePageSourceImpl { + // This is the source from the user. DataIterProxy iter_; DMatrixProxy* proxy_; - size_t base_row_id_ {0}; + std::size_t base_row_id_{0}; void Fetch() final { page_ = std::make_shared(); diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 4b834e78f..98e380682 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -439,7 +439,7 @@ struct ShapSplitCondition { if (isnan(x)) { return is_missing_branch; } - if (categories.Size() != 0) { + if (categories.Capacity() != 0) { auto cat = static_cast(x); return categories.Check(cat); } else { @@ -454,7 +454,7 @@ struct ShapSplitCondition { if (l.Data() == r.Data()) { return l; } - if (l.Size() > r.Size()) { + if (l.Capacity() > r.Capacity()) { thrust::swap(l, r); } for (size_t i = 0; i < r.Bits().size(); ++i) { @@ -466,7 +466,7 @@ struct ShapSplitCondition { // Combine two split conditions on the same feature XGBOOST_DEVICE void Merge(ShapSplitCondition other) { // Combine duplicate features - if (categories.Size() != 0 || other.categories.Size() != 0) { + if (categories.Capacity() != 0 || other.categories.Capacity() != 0) { categories = Intersect(categories, other.categories); } else { feature_lower_bound = max(feature_lower_bound, other.feature_lower_bound); diff --git a/src/tree/constraints.cu b/src/tree/constraints.cu index b6db0eda0..ae1d3073c 100644 --- a/src/tree/constraints.cu +++ b/src/tree/constraints.cu @@ -1,5 +1,5 @@ -/*! 
- * Copyright 2019 XGBoost contributors +/** + * Copyright 2019-2023, XGBoost contributors */ #include #include @@ -140,20 +140,20 @@ void FeatureInteractionConstraintDevice::Reset() { __global__ void ClearBuffersKernel( LBitField64 result_buffer_output, LBitField64 result_buffer_input) { auto tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < result_buffer_output.Size()) { + if (tid < result_buffer_output.Capacity()) { result_buffer_output.Clear(tid); } - if (tid < result_buffer_input.Size()) { + if (tid < result_buffer_input.Capacity()) { result_buffer_input.Clear(tid); } } void FeatureInteractionConstraintDevice::ClearBuffers() { - CHECK_EQ(output_buffer_bits_.Size(), input_buffer_bits_.Size()); - CHECK_LE(feature_buffer_.Size(), output_buffer_bits_.Size()); + CHECK_EQ(output_buffer_bits_.Capacity(), input_buffer_bits_.Capacity()); + CHECK_LE(feature_buffer_.Capacity(), output_buffer_bits_.Capacity()); uint32_t constexpr kBlockThreads = 256; auto const n_grids = static_cast( - common::DivRoundUp(input_buffer_bits_.Size(), kBlockThreads)); + common::DivRoundUp(input_buffer_bits_.Capacity(), kBlockThreads)); dh::LaunchKernel {n_grids, kBlockThreads} ( ClearBuffersKernel, output_buffer_bits_, input_buffer_bits_); @@ -207,11 +207,11 @@ common::Span FeatureInteractionConstraintDevice::Query( ClearBuffers(); LBitField64 node_constraints = s_node_constraints_[nid]; - CHECK_EQ(input_buffer_bits_.Size(), output_buffer_bits_.Size()); + CHECK_EQ(input_buffer_bits_.Capacity(), output_buffer_bits_.Capacity()); uint32_t constexpr kBlockThreads = 256; auto n_grids = static_cast( - common::DivRoundUp(output_buffer_bits_.Size(), kBlockThreads)); + common::DivRoundUp(output_buffer_bits_.Capacity(), kBlockThreads)); dh::LaunchKernel {n_grids, kBlockThreads} ( SetInputBufferKernel, feature_list, input_buffer_bits_); @@ -274,13 +274,13 @@ __global__ void InteractionConstraintSplitKernel(LBitField64 feature, LBitField64 left, LBitField64 right) { auto tid = threadIdx.x + blockDim.x * blockIdx.x; - if (tid > node.Size()) { + if (tid > node.Capacity()) { return; } // enable constraints from feature node |= feature; // clear the buffer after use - if (tid < feature.Size()) { + if (tid < feature.Capacity()) { feature.Clear(tid); } @@ -323,7 +323,7 @@ void FeatureInteractionConstraintDevice::Split( s_sets_, s_sets_ptr_); uint32_t constexpr kBlockThreads = 256; - auto n_grids = static_cast(common::DivRoundUp(node.Size(), kBlockThreads)); + auto n_grids = static_cast(common::DivRoundUp(node.Capacity(), kBlockThreads)); dh::LaunchKernel {n_grids, kBlockThreads} ( InteractionConstraintSplitKernel, diff --git a/src/tree/tree_model.cc b/src/tree/tree_model.cc index 7550904b5..f32ea701f 100644 --- a/src/tree/tree_model.cc +++ b/src/tree/tree_model.cc @@ -213,7 +213,7 @@ std::vector GetSplitCategories(RegTree const &tree, int32_t nidx) { auto split = common::KCatBitField{csr.categories.subspan(seg.beg, seg.size)}; std::vector cats; - for (size_t i = 0; i < split.Size(); ++i) { + for (size_t i = 0; i < split.Capacity(); ++i) { if (split.Check(i)) { cats.push_back(static_cast(i)); } @@ -1004,7 +1004,7 @@ void RegTree::SaveCategoricalSplit(Json* p_out) const { auto segment = split_categories_segments_[i]; auto node_categories = this->GetSplitCategories().subspan(segment.beg, segment.size); common::KCatBitField const cat_bits(node_categories); - for (size_t i = 0; i < cat_bits.Size(); ++i) { + for (size_t i = 0; i < cat_bits.Capacity(); ++i) { if (cat_bits.Check(i)) { categories.GetArray().emplace_back(i); } diff --git 
a/tests/cpp/common/test_bitfield.cc b/tests/cpp/common/test_bitfield.cc index c7b2d5cb9..902e69f85 100644 --- a/tests/cpp/common/test_bitfield.cc +++ b/tests/cpp/common/test_bitfield.cc @@ -1,5 +1,5 @@ -/*! - * Copyright 2019 XGBoost contributors +/** + * Copyright 2019-2023, XGBoost contributors */ #include #include "../../../src/common/bitfield.h" @@ -14,7 +14,7 @@ TEST(BitField, Check) { static_cast::index_type>( storage.size())}); size_t true_bit = 190; - for (size_t i = true_bit + 1; i < bits.Size(); ++i) { + for (size_t i = true_bit + 1; i < bits.Capacity(); ++i) { ASSERT_FALSE(bits.Check(i)); } ASSERT_TRUE(bits.Check(true_bit)); @@ -34,7 +34,7 @@ TEST(BitField, Check) { ASSERT_FALSE(bits.Check(i)); } ASSERT_TRUE(bits.Check(true_bit)); - for (size_t i = true_bit + 1; i < bits.Size(); ++i) { + for (size_t i = true_bit + 1; i < bits.Capacity(); ++i) { ASSERT_FALSE(bits.Check(i)); } } diff --git a/tests/cpp/common/test_bitfield.cu b/tests/cpp/common/test_bitfield.cu index 98fbd2ad1..a9b183c43 100644 --- a/tests/cpp/common/test_bitfield.cu +++ b/tests/cpp/common/test_bitfield.cu @@ -1,5 +1,5 @@ -/*! - * Copyright 2019 XGBoost contributors +/** + * Copyright 2019-2023, XGBoost contributors */ #include #include @@ -12,7 +12,7 @@ namespace xgboost { __global__ void TestSetKernel(LBitField64 bits) { auto tid = threadIdx.x + blockIdx.x * blockDim.x; - if (tid < bits.Size()) { + if (tid < bits.Capacity()) { bits.Set(tid); } } @@ -36,20 +36,16 @@ TEST(BitField, GPUSet) { std::vector h_storage(storage.size()); thrust::copy(storage.begin(), storage.end(), h_storage.begin()); - - LBitField64 outputs { - common::Span{h_storage.data(), - h_storage.data() + h_storage.size()}}; + LBitField64 outputs{ + common::Span{h_storage.data(), h_storage.data() + h_storage.size()}}; for (size_t i = 0; i < kBits; ++i) { ASSERT_TRUE(outputs.Check(i)); } } -__global__ void TestOrKernel(LBitField64 lhs, LBitField64 rhs) { - lhs |= rhs; -} - -TEST(BitField, GPUAnd) { +namespace { +template +void TestGPULogic(Op op) { uint32_t constexpr kBits = 128; dh::device_vector lhs_storage(kBits); dh::device_vector rhs_storage(kBits); @@ -57,13 +53,32 @@ TEST(BitField, GPUAnd) { auto rhs = LBitField64(dh::ToSpan(rhs_storage)); thrust::fill(lhs_storage.begin(), lhs_storage.end(), 0UL); thrust::fill(rhs_storage.begin(), rhs_storage.end(), ~static_cast(0UL)); - TestOrKernel<<<1, kBits>>>(lhs, rhs); + dh::LaunchN(kBits, [=] __device__(auto) mutable { op(lhs, rhs); }); std::vector h_storage(lhs_storage.size()); thrust::copy(lhs_storage.begin(), lhs_storage.end(), h_storage.begin()); - LBitField64 outputs {{h_storage.data(), h_storage.data() + h_storage.size()}}; - for (size_t i = 0; i < kBits; ++i) { - ASSERT_TRUE(outputs.Check(i)); + LBitField64 outputs{{h_storage.data(), h_storage.data() + h_storage.size()}}; + if (is_and) { + for (size_t i = 0; i < kBits; ++i) { + ASSERT_FALSE(outputs.Check(i)); + } + } else { + for (size_t i = 0; i < kBits; ++i) { + ASSERT_TRUE(outputs.Check(i)); + } } } -} // namespace xgboost \ No newline at end of file + +void TestGPUAnd() { + TestGPULogic([] XGBOOST_DEVICE(LBitField64 & lhs, LBitField64 const& rhs) { lhs &= rhs; }); +} + +void TestGPUOr() { + TestGPULogic([] XGBOOST_DEVICE(LBitField64 & lhs, LBitField64 const& rhs) { lhs |= rhs; }); +} +} // namespace + +TEST(BitField, GPUAnd) { TestGPUAnd(); } + +TEST(BitField, GPUOr) { TestGPUOr(); } +} // namespace xgboost diff --git a/tests/cpp/common/test_column_matrix.cc b/tests/cpp/common/test_column_matrix.cc index 0578683d8..8b8df4861 100644 --- 
a/tests/cpp/common/test_column_matrix.cc +++ b/tests/cpp/common/test_column_matrix.cc @@ -83,7 +83,9 @@ template void CheckColumWithMissingValue(const DenseColumnIter& col, const GHistIndexMatrix& gmat) { for (auto i = 0ull; i < col.Size(); i++) { - if (col.IsMissing(i)) continue; + if (col.IsMissing(i)) { + continue; + } EXPECT_EQ(gmat.index[gmat.row_ptr[i]], col.GetGlobalBinIdx(i)); } } diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index f1317fc02..cb2f7d604 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -285,8 +285,6 @@ TEST(GpuHist, PartitionTwoNodes) { dh::ToSpan(feature_histogram_b)}; thrust::device_vector results(2); evaluator.EvaluateSplits({0, 1}, 1, dh::ToSpan(inputs), shared_inputs, dh::ToSpan(results)); - GPUExpandEntry result_a = results[0]; - GPUExpandEntry result_b = results[1]; EXPECT_EQ(std::bitset<32>(evaluator.GetHostNodeCats(0)[0]), std::bitset<32>("10000000000000000000000000000000")); EXPECT_EQ(std::bitset<32>(evaluator.GetHostNodeCats(1)[0]), diff --git a/tests/cpp/tree/test_constraints.cu b/tests/cpp/tree/test_constraints.cu index c9f1639b3..09e72a1d2 100644 --- a/tests/cpp/tree/test_constraints.cu +++ b/tests/cpp/tree/test_constraints.cu @@ -1,5 +1,5 @@ -/*! - * Copyright 2019 XGBoost contributors +/** + * Copyright 2019-2023, XGBoost contributors */ #include #include @@ -53,7 +53,7 @@ void CompareBitField(LBitField64 d_field, std::set positions) { LBitField64 h_field{ {h_field_storage.data(), h_field_storage.data() + h_field_storage.size()} }; - for (size_t i = 0; i < h_field.Size(); ++i) { + for (size_t i = 0; i < h_field.Capacity(); ++i) { if (positions.find(i) != positions.cend()) { ASSERT_TRUE(h_field.Check(i)); } else { @@ -82,7 +82,7 @@ TEST(GPUFeatureInteractionConstraint, Init) { {h_node_storage.data(), h_node_storage.data() + h_node_storage.size()} }; // no feature is attached to node. - for (size_t i = 0; i < h_node.Size(); ++i) { + for (size_t i = 0; i < h_node.Capacity(); ++i) { ASSERT_FALSE(h_node.Check(i)); } } From 6efe7c129fbb0f08d5dd9b856d9afaa28f5f033f Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 26 Jun 2023 18:32:11 +0800 Subject: [PATCH 007/136] [doc] Update reference in R vignettes. (#9323) --- R-package/vignettes/xgboost.bib | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/R-package/vignettes/xgboost.bib b/R-package/vignettes/xgboost.bib index 5deb1e13d..908be3136 100644 --- a/R-package/vignettes/xgboost.bib +++ b/R-package/vignettes/xgboost.bib @@ -18,13 +18,11 @@ publisher={Institute of Mathematical Statistics} } - @misc{ Bache+Lichman:2013 , author = "K. Bache and M. Lichman", year = "2013", title = "{UCI} Machine Learning Repository", - url = "http://archive.ics.uci.edu/ml/", - institution = "University of California, Irvine, School of Information and Computer Sciences" + url = "https://archive.ics.uci.edu/", + institution = "University of California, Irvine, School of Information and Computer Sciences" } - From cfa9c42eb4687c227bcd4c37d46602e9e5ce32b8 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 26 Jun 2023 22:35:02 +0800 Subject: [PATCH 008/136] Fix callback in AFT viz demo. (#9333) * Fix callback in AFT viz demo. - Update the callback function. - Add lint check. 
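For reference, the updated demo replaces the removed `env`-based callback with the `xgboost.callback.TrainingCallback` interface; a minimal sketch of that interface (with an illustrative class name) looks like this:

import xgboost as xgb

class LogIteration(xgb.callback.TrainingCallback):
    """Minimal new-style callback; return True to request early stopping."""

    def after_iteration(self, model, epoch, evals_log):
        print(f"finished iteration {epoch}")
        return False  # keep training

The full version used by the demo, `PlotIntermediateModel`, is shown in the diff that follows.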
--- demo/aft_survival/aft_survival_viz_demo.py | 135 +++++++++++++-------- tests/ci_build/lint_python.py | 10 +- 2 files changed, 96 insertions(+), 49 deletions(-) diff --git a/demo/aft_survival/aft_survival_viz_demo.py b/demo/aft_survival/aft_survival_viz_demo.py index a17c55edf..b925ca547 100644 --- a/demo/aft_survival/aft_survival_viz_demo.py +++ b/demo/aft_survival/aft_survival_viz_demo.py @@ -11,33 +11,43 @@ import numpy as np import xgboost as xgb -plt.rcParams.update({'font.size': 13}) +plt.rcParams.update({"font.size": 13}) + # Function to visualize censored labels -def plot_censored_labels(X, y_lower, y_upper): - def replace_inf(x, target_value): +def plot_censored_labels( + X: np.ndarray, y_lower: np.ndarray, y_upper: np.ndarray +) -> None: + def replace_inf(x: np.ndarray, target_value: float) -> np.ndarray: x[np.isinf(x)] = target_value return x - plt.plot(X, y_lower, 'o', label='y_lower', color='blue') - plt.plot(X, y_upper, 'o', label='y_upper', color='fuchsia') - plt.vlines(X, ymin=replace_inf(y_lower, 0.01), ymax=replace_inf(y_upper, 1000), - label='Range for y', color='gray') + + plt.plot(X, y_lower, "o", label="y_lower", color="blue") + plt.plot(X, y_upper, "o", label="y_upper", color="fuchsia") + plt.vlines( + X, + ymin=replace_inf(y_lower, 0.01), + ymax=replace_inf(y_upper, 1000.0), + label="Range for y", + color="gray", + ) + # Toy data X = np.array([1, 2, 3, 4, 5]).reshape((-1, 1)) INF = np.inf -y_lower = np.array([ 10, 15, -INF, 30, 100]) -y_upper = np.array([INF, INF, 20, 50, INF]) +y_lower = np.array([10, 15, -INF, 30, 100]) +y_upper = np.array([INF, INF, 20, 50, INF]) # Visualize toy data plt.figure(figsize=(5, 4)) plot_censored_labels(X, y_lower, y_upper) plt.ylim((6, 200)) -plt.legend(loc='lower right') -plt.title('Toy data') -plt.xlabel('Input feature') -plt.ylabel('Label') -plt.yscale('log') +plt.legend(loc="lower right") +plt.title("Toy data") +plt.xlabel("Input feature") +plt.ylabel("Label") +plt.yscale("log") plt.tight_layout() plt.show(block=True) @@ -46,54 +56,83 @@ grid_pts = np.linspace(0.8, 5.2, 1000).reshape((-1, 1)) # Train AFT model using XGBoost dmat = xgb.DMatrix(X) -dmat.set_float_info('label_lower_bound', y_lower) -dmat.set_float_info('label_upper_bound', y_upper) -params = {'max_depth': 3, 'objective':'survival:aft', 'min_child_weight': 0} +dmat.set_float_info("label_lower_bound", y_lower) +dmat.set_float_info("label_upper_bound", y_upper) +params = {"max_depth": 3, "objective": "survival:aft", "min_child_weight": 0} accuracy_history = [] -def plot_intermediate_model_callback(env): - """Custom callback to plot intermediate models""" - # Compute y_pred = prediction using the intermediate model, at current boosting iteration - y_pred = env.model.predict(dmat) - # "Accuracy" = the number of data points whose ranged label (y_lower, y_upper) includes - # the corresponding predicted label (y_pred) - acc = np.sum(np.logical_and(y_pred >= y_lower, y_pred <= y_upper)/len(X) * 100) - accuracy_history.append(acc) - # Plot ranged labels as well as predictions by the model - plt.subplot(5, 3, env.iteration + 1) - plot_censored_labels(X, y_lower, y_upper) - y_pred_grid_pts = env.model.predict(xgb.DMatrix(grid_pts)) - plt.plot(grid_pts, y_pred_grid_pts, 'r-', label='XGBoost AFT model', linewidth=4) - plt.title('Iteration {}'.format(env.iteration), x=0.5, y=0.8) - plt.xlim((0.8, 5.2)) - plt.ylim((1 if np.min(y_pred) < 6 else 6, 200)) - plt.yscale('log') -res = {} -plt.figure(figsize=(12,13)) -bst = xgb.train(params, dmat, 15, [(dmat, 'train')], 
evals_result=res,
-                callbacks=[plot_intermediate_model_callback])
+class PlotIntermediateModel(xgb.callback.TrainingCallback):
+    """Custom callback to plot intermediate models."""
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def after_iteration(
+        self,
+        model: xgb.Booster,
+        epoch: int,
+        evals_log: xgb.callback.TrainingCallback.EvalsLog,
+    ) -> bool:
+        """Run after each boosting iteration."""
+        # Compute y_pred = prediction using the intermediate model, at current boosting
+        # iteration
+        y_pred = model.predict(dmat)
+        # "Accuracy" = the number of data points whose ranged label (y_lower, y_upper)
+        # includes the corresponding predicted label (y_pred)
+        acc = np.sum(
+            np.logical_and(y_pred >= y_lower, y_pred <= y_upper) / len(X) * 100
+        )
+        accuracy_history.append(acc)
+
+        # Plot ranged labels as well as predictions by the model
+        plt.subplot(5, 3, epoch + 1)
+        plot_censored_labels(X, y_lower, y_upper)
+        y_pred_grid_pts = model.predict(xgb.DMatrix(grid_pts))
+        plt.plot(
+            grid_pts, y_pred_grid_pts, "r-", label="XGBoost AFT model", linewidth=4
+        )
+        plt.title("Iteration {}".format(epoch), x=0.5, y=0.8)
+        plt.xlim((0.8, 5.2))
+        plt.ylim((1 if np.min(y_pred) < 6 else 6, 200))
+        plt.yscale("log")
+        return False
+
+
+res: xgb.callback.TrainingCallback.EvalsLog = {}
+plt.figure(figsize=(12, 13))
+bst = xgb.train(
+    params,
+    dmat,
+    15,
+    [(dmat, "train")],
+    evals_result=res,
+    callbacks=[PlotIntermediateModel()],
+)
 plt.tight_layout()
-plt.legend(loc='lower center', ncol=4,
-           bbox_to_anchor=(0.5, 0),
-           bbox_transform=plt.gcf().transFigure)
+plt.legend(
+    loc="lower center",
+    ncol=4,
+    bbox_to_anchor=(0.5, 0),
+    bbox_transform=plt.gcf().transFigure,
+)
 plt.tight_layout()
 
 # Plot negative log likelihood over boosting iterations
-plt.figure(figsize=(8,3))
+plt.figure(figsize=(8, 3))
 plt.subplot(1, 2, 1)
-plt.plot(res['train']['aft-nloglik'], 'b-o', label='aft-nloglik')
-plt.xlabel('# Boosting Iterations')
-plt.legend(loc='best')
+plt.plot(res["train"]["aft-nloglik"], "b-o", label="aft-nloglik")
+plt.xlabel("# Boosting Iterations")
+plt.legend(loc="best")
 
 # Plot "accuracy" over boosting iterations
 # "Accuracy" = the number of data points whose ranged label (y_lower, y_upper) includes
 # the corresponding predicted label (y_pred)
 plt.subplot(1, 2, 2)
-plt.plot(accuracy_history, 'r-o', label='Accuracy (%)')
-plt.xlabel('# Boosting Iterations')
-plt.legend(loc='best')
+plt.plot(accuracy_history, "r-o", label="Accuracy (%)")
+plt.xlabel("# Boosting Iterations")
+plt.legend(loc="best")
 plt.tight_layout()
 plt.show()
diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py
index 90c52aad4..85ece676e 100644
--- a/tests/ci_build/lint_python.py
+++ b/tests/ci_build/lint_python.py
@@ -37,6 +37,7 @@ class LintersPaths:
         "demo/guide-python/quantile_regression.py",
         "demo/guide-python/multioutput_regression.py",
         "demo/guide-python/learning_to_rank.py",
+        "demo/aft_survival/aft_survival_viz_demo.py",
         # CI
         "tests/ci_build/lint_python.py",
         "tests/ci_build/test_r_package.py",
@@ -78,6 +79,7 @@ class LintersPaths:
         "demo/guide-python/quantile_regression.py",
         "demo/guide-python/multioutput_regression.py",
         "demo/guide-python/learning_to_rank.py",
+        "demo/aft_survival/aft_survival_viz_demo.py",
         # CI
         "tests/ci_build/lint_python.py",
         "tests/ci_build/test_r_package.py",
@@ -114,7 +116,13 @@ def run_black(rel_path: str, fix: bool) -> bool:
 
 @cd(PY_PACKAGE)
 def run_isort(rel_path: str, fix: bool) -> bool:
     # Isort gets confused when trying to find the config file, so specified explicitly.
- cmd = ["isort", "--settings-path", PY_PACKAGE, os.path.join(ROOT, rel_path)] + cmd = [ + "isort", + "--settings-path", + PY_PACKAGE, + f"--src={PY_PACKAGE}", + os.path.join(ROOT, rel_path), + ] if not fix: cmd += ["--check"] From 96c3071a8ae3740ce88b940882e3bc2807e57b81 Mon Sep 17 00:00:00 2001 From: jasjung Date: Mon, 26 Jun 2023 22:56:18 -0700 Subject: [PATCH 009/136] [doc] Update learning_to_rank.rst (#9336) --- doc/tutorials/learning_to_rank.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/tutorials/learning_to_rank.rst b/doc/tutorials/learning_to_rank.rst index b9883d236..e0c27b87b 100644 --- a/doc/tutorials/learning_to_rank.rst +++ b/doc/tutorials/learning_to_rank.rst @@ -48,8 +48,9 @@ Notice that the samples are sorted based on their query index in a non-decreasin import xgboost as xgb # Make a synthetic ranking dataset for demonstration - X, y = make_classification(random_state=rng) - rng = np.random.default_rng(1994) + seed = 1994 + X, y = make_classification(random_state=seed) + rng = np.random.default_rng(seed) n_query_groups = 3 qid = rng.integers(0, 3, size=X.shape[0]) From bc267dd72983c5ca734cfc7f20296cc4797ef25c Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 27 Jun 2023 19:05:46 +0800 Subject: [PATCH 010/136] Use ptr from `mmap` for `GHistIndexMatrix` and `ColumnMatrix`. (#9315) * Use ptr from mmap for `GHistIndexMatrix` and `ColumnMatrix`. - Define a resource for holding various types of memory pointers. - Define ref vector for holding resources. - Swap the underlying resources for GHist and ColumnM. - Add documentation for current status. - s390x support is removed. It should work if you can compile XGBoost, all the old workaround code does is to get GCC to compile. --- doc/c.rst | 2 + doc/tutorials/dask.rst | 3 + doc/tutorials/external_memory.rst | 79 +++- doc/tutorials/index.rst | 18 +- doc/tutorials/param_tuning.rst | 43 +++ rabit/include/rabit/internal/io.h | 11 +- src/common/column_matrix.cc | 79 +++- src/common/column_matrix.h | 242 ++++++------- src/common/hist_util.h | 96 ++--- src/common/io.cc | 104 ++++-- src/common/io.h | 336 ++++++++++++++++-- src/common/ref_resource_view.h | 158 ++++++++ src/data/ellpack_page_raw_format.cu | 63 ++-- src/data/gradient_index.cc | 58 ++- src/data/gradient_index.cu | 10 +- src/data/gradient_index.h | 61 ++-- src/data/gradient_index_format.cc | 96 ++--- src/data/histogram_cut_format.h | 40 ++- src/data/iterative_dmatrix.cc | 6 +- src/data/sparse_page_raw_format.cc | 49 ++- src/data/sparse_page_source.h | 58 ++- src/data/sparse_page_writer.h | 59 ++- src/gbm/gbtree.cc | 16 + tests/cpp/common/test_io.cc | 80 ++++- tests/cpp/common/test_ref_resource_view.cc | 108 ++++++ .../cpp/data/test_ellpack_page_raw_format.cu | 15 +- tests/cpp/data/test_gradient_index.cc | 18 +- .../test_gradient_index_page_raw_format.cc | 26 +- tests/cpp/data/test_sparse_page_raw_format.cc | 23 +- 29 files changed, 1448 insertions(+), 509 deletions(-) create mode 100644 src/common/ref_resource_view.h create mode 100644 tests/cpp/common/test_ref_resource_view.cc diff --git a/doc/c.rst b/doc/c.rst index d63e779e1..9a9d7b557 100644 --- a/doc/c.rst +++ b/doc/c.rst @@ -33,6 +33,8 @@ DMatrix .. doxygengroup:: DMatrix :project: xgboost +.. 
_c_streaming:
+
 Streaming
 ---------
 
diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst
index 3562015e2..fa487f1c8 100644
--- a/doc/tutorials/dask.rst
+++ b/doc/tutorials/dask.rst
@@ -54,6 +54,9 @@ on a dask cluster:
     y = da.random.random(size=(num_obs, 1), chunks=(1000, 1))
 
     dtrain = xgb.dask.DaskDMatrix(client, X, y)
+    # or
+    # dtrain = xgb.dask.DaskQuantileDMatrix(client, X, y)
+    # `DaskQuantileDMatrix` is available for the `hist` and `gpu_hist` tree method.
 
     output = xgb.dask.train(
diff --git a/doc/tutorials/external_memory.rst b/doc/tutorials/external_memory.rst
index f5b6132c7..832d13edd 100644
--- a/doc/tutorials/external_memory.rst
+++ b/doc/tutorials/external_memory.rst
@@ -22,6 +22,15 @@ GPU-based training algorithm. We will introduce them in the following sections.
 
   The feature is still experimental as of 2.0. The performance is not well optimized.
 
+The external memory support has gone through multiple iterations and is still under heavy
+development. Like the :py:class:`~xgboost.QuantileDMatrix` with
+:py:class:`~xgboost.DataIter`, XGBoost loads data batch-by-batch using a custom iterator
+supplied by the user. However, unlike the :py:class:`~xgboost.QuantileDMatrix`, external
+memory will not concatenate the batches unless GPU is used (it uses a hybrid approach,
+more details follow). Instead, it will cache all batches in external memory and fetch
+them on-demand. Go to the end of the document to see a comparison between
+`QuantileDMatrix` and external memory.
+
 *************
 Data Iterator
 *************
@@ -113,10 +122,11 @@ External memory is supported by GPU algorithms (i.e. when ``tree_method`` is set
 ``gpu_hist``). However, the algorithm used for GPU is different from the one used for
 CPU. When training on a CPU, the tree method iterates through all batches from external
 memory for each step of the tree construction algorithm. On the other hand, the GPU
-algorithm concatenates all batches into one and stores it in GPU memory. To reduce overall
-memory usage, users can utilize subsampling. The good news is that the GPU hist tree
-method supports gradient-based sampling, enabling users to set a low sampling rate without
-compromising accuracy.
+algorithm uses a hybrid approach. It iterates through the data at the beginning of
+each iteration and concatenates all batches into one in GPU memory. To reduce overall
+memory usage, users can utilize subsampling. The GPU hist tree method supports
+`gradient-based sampling`, enabling users to set a low sampling rate without compromising
+accuracy.
 
 .. code-block:: python
 
@@ -134,6 +144,8 @@ see `this paper `_.
 When GPU is running out of memory during iteration on external memory, the user might
 receive a segfault instead of an OOM exception.
 
+.. _ext_remarks:
+
 *******
 Remarks
 *******
@@ -142,17 +154,64 @@ When using external memory with XGBoost, data is divided into smaller chunks so
 a fraction of it needs to be stored in memory at any given time. It's important to note that
 this method only applies to the predictor data (``X``), while other data, like labels and
 internal runtime structures are concatenated. This means that memory reduction is most
-effective when dealing with wide datasets where ``X`` is larger compared to other data
-like ``y``, while it has little impact on slim datasets.
+effective when dealing with wide datasets where ``X`` is significantly larger in size
+compared to other data like ``y``, while it has little impact on slim datasets.
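To keep the remarks that follow concrete, here is a minimal, self-contained sketch of the iterator-based setup this tutorial revolves around. The synthetic batches and the ``cache`` prefix are placeholders; :py:class:`xgboost.DataIter` with ``cache_prefix``, the ``next``/``reset`` pair, and the ``input_data`` callback follow the interface described in the Data Iterator section above.

.. code-block:: python

    import numpy as np
    import xgboost

    class Iterator(xgboost.DataIter):
        """Serve a fixed list of in-memory batches to XGBoost."""

        def __init__(self, n_batches: int) -> None:
            rng = np.random.default_rng(2023)
            # Pre-generate the batches so every pass sees the same data.
            self._batches = [
                (rng.normal(size=(4096, 16)), rng.normal(size=4096))
                for _ in range(n_batches)
            ]
            self._it = 0
            # A non-None cache_prefix is what enables the on-disk cache.
            super().__init__(cache_prefix="cache")

        def next(self, input_data) -> int:
            # Return 1 to continue iterating, 0 to signal the end of data.
            if self._it == len(self._batches):
                return 0
            X, y = self._batches[self._it]
            input_data(data=X, label=y)
            self._it += 1
            return 1

        def reset(self) -> None:
            # Called by XGBoost before each new pass over the batches.
            self._it = 0

    Xy = xgboost.DMatrix(Iterator(n_batches=4))  # batches cached on disk
    booster = xgboost.train({"tree_method": "hist"}, Xy, num_boost_round=8)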
+
+As one might expect, fetching data on-demand puts significant pressure on the storage
+device. Today's computing devices can process far more data than storage can read in the
+same amount of time; the gap spans orders of magnitude. A GPU is capable of processing
+hundreds of gigabytes of floating-point data in a split second. On the other hand, a
+four-lane NVMe storage device connected to a PCIe-4 slot usually has about 6GB/s of data
+transfer rate. As a result, the training is likely to be severely bottlenecked by your
+storage device. Before adopting the external memory solution, some back-of-envelope
+calculations might help you see whether it's viable. For instance, assume your NVMe drive
+can transfer 4GB of data per second (a fairly practical number) and you have 100GB of data
+in the compressed XGBoost cache (which corresponds to a dense float32 numpy array of
+roughly 200GB, give or take). A tree of depth 8 needs at least 16 iterations through the
+data when the parameters are right, so reading the cache alone takes about 400 seconds
+(close to 7 minutes) per tree, before accounting for other overheads and assuming the
+computation fully overlaps with the IO. If your dataset happens to have TB-level size,
+then you might need thousands of trees to get a generalized model. These calculations can
+help you get an estimate of the expected training time.
+
+However, sometimes we can ameliorate this limitation. One should also consider that the OS
+(mostly the Linux kernel here) can usually cache the data in host memory. It only evicts
+pages when new data comes in and there's no room left. In practice, at least some portion
+of the data can persist in host memory throughout the entire training session. We are
+aware of this cache when optimizing the external memory fetcher. The compressed cache is
+usually smaller than the raw input data, especially when the input is dense without any
+missing values. If the host memory can fit a significant portion of this compressed cache,
+then the performance should be decent after initialization. Our development so far focuses
+on two fronts of optimization for external memory:
+
+- Avoid iterating through the data whenever appropriate.
+- If the OS can cache the data, the performance should be close to in-core training.
 
 Starting with XGBoost 2.0, the implementation of external memory uses ``mmap``. It is not
-yet tested against system errors like disconnected network devices (`SIGBUS`). Also, it's
-worth noting that most tests have been conducted on Linux distributions.
+tested against system errors like disconnected network devices (`SIGBUS`). In the face of
+a bus error, you will see a hard crash and need to clean up the cache files. If the
+training session might take a long time and you are using solutions like NVMe-oF, we
+recommend checkpointing your model periodically. Also, it's worth noting that most tests
+have been conducted on Linux distributions.
+
 Another important point to keep in mind is that creating the initial cache for XGBoost may
-take some time. The interface to external memory is through custom iterators, which may or
-may not be thread-safe. Therefore, initialization is performed sequentially.
+take some time. The interface to external memory is through custom iterators, which we
+cannot assume to be thread-safe. Therefore, initialization is performed sequentially. Using
+the `xgboost.config_context` with `verbosity=2` can give you some information on what
+XGBoost is doing during the wait if you don't mind the extra output.
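Written out, the back-of-envelope estimate above is only a few lines; the figures are the same illustrative assumptions used in the paragraph, not measurements:

.. code-block:: python

    bandwidth_gb_per_s = 4  # assumed sustained NVMe read throughput
    cache_size_gb = 100     # assumed size of the compressed XGBoost cache
    passes_per_tree = 16    # iterations through the data for a depth-8 tree

    io_seconds_per_tree = cache_size_gb * passes_per_tree / bandwidth_gb_per_s
    print(io_seconds_per_tree)  # 400.0 seconds, close to 7 minutes per tree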
+*******************************
+Compared to the QuantileDMatrix
+*******************************
+
+Passing an iterator to the :py:class:`~xgboost.QuantileDMatrix` enables direct
+construction of `QuantileDMatrix` with data chunks. On the other hand, if it's passed to
+:py:class:`~xgboost.DMatrix`, it instead enables the external memory feature. The
+:py:class:`~xgboost.QuantileDMatrix` concatenates the data in memory after compression and
+doesn't fetch data during training. By contrast, the external memory `DMatrix`
+fetches data batches from external memory on-demand. Use the `QuantileDMatrix` (with an
+iterator if necessary) when you can fit most of your data in memory. The training would be
+an order of magnitude faster than using external memory.
 
 ****************
 Text File Inputs
diff --git a/doc/tutorials/index.rst b/doc/tutorials/index.rst
index eb8c23726..7693173e9 100644
--- a/doc/tutorials/index.rst
+++ b/doc/tutorials/index.rst
@@ -11,22 +11,22 @@ See `Awesome XGBoost `_ for mo
   model
   saving_model
+  learning_to_rank
+  dart
+  monotonic
+  feature_interaction_constraint
+  aft_survival_analysis
+  categorical
+  multioutput
+  rf
   kubernetes
   Distributed XGBoost with XGBoost4J-Spark
   Distributed XGBoost with XGBoost4J-Spark-GPU
   dask
   spark_estimator
   ray
-  dart
-  monotonic
-  rf
-  feature_interaction_constraint
-  learning_to_rank
-  aft_survival_analysis
+  external_memory
   c_api_tutorial
   input_format
   param_tuning
-  external_memory
   custom_metric_obj
-  categorical
-  multioutput
diff --git a/doc/tutorials/param_tuning.rst b/doc/tutorials/param_tuning.rst
index cce145444..cb58fcc20 100644
--- a/doc/tutorials/param_tuning.rst
+++ b/doc/tutorials/param_tuning.rst
@@ -58,3 +58,46 @@ This can affect the training of XGBoost model, and there are two ways to improve
   - In such a case, you cannot re-balance the dataset
 
 - Set parameter ``max_delta_step`` to a finite number (say 1) to help convergence
+
+
+*********************
+Reducing Memory Usage
+*********************
+
+If you are using an HPO library like :py:class:`sklearn.model_selection.GridSearchCV`,
+please control the number of threads it can use. It's best to let XGBoost run in
+parallel instead of asking `GridSearchCV` to run multiple experiments at the same
+time. For instance, creating a fold of data for cross validation can consume a significant
+amount of memory:
+
+.. code-block:: python
+
+    # This creates a copy of dataset. X and X_train are both in memory at the same time.
+
+    # This happens for every thread at the same time if you run `GridSearchCV` with
+    # `n_jobs` larger than 1
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y)
+
+.. code-block:: python
+
+    df = pd.DataFrame()
+    # This creates a new copy of the dataframe, even if you specify the inplace parameter
+    new_df = df.drop(...)
+
+.. code-block:: python
+
+    array = np.array(...)
+    # This may or may not make a copy of the data, depending on the type of the data
+    array.astype(np.float32)
+
+.. code-block:: python
+
+    # np by default uses double, do you actually need it?
+    array = np.array(...)
+
+You can find more specific memory reduction practices scattered through the
+documentation. For instance: :doc:`/tutorials/dask`, :doc:`/gpu/index`,
+:doc:`/contrib/scaling`. However, before going into these, being conscious about making
+data copies is a good starting point. Copying usually consumes a lot more memory than
+people expect.
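As a short sketch of the `GridSearchCV` advice above (the estimator settings and the parameter grid are placeholders): give the threads to XGBoost and keep the search itself sequential, so only one fold copy is alive at any moment.

.. code-block:: python

    from sklearn.datasets import make_regression
    from sklearn.model_selection import GridSearchCV

    import xgboost as xgb

    X, y = make_regression(n_samples=4096, n_features=16)

    # XGBoost parallelizes each fit internally; n_jobs=1 on the search
    # avoids materializing one copy of the fold data per worker.
    estimator = xgb.XGBRegressor(n_jobs=-1, tree_method="hist")
    search = GridSearchCV(estimator, {"max_depth": [4, 6]}, n_jobs=1)
    search.fit(X, y)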
diff --git a/rabit/include/rabit/internal/io.h b/rabit/include/rabit/internal/io.h
index a12e1decd..d93f32ff9 100644
--- a/rabit/include/rabit/internal/io.h
+++ b/rabit/include/rabit/internal/io.h
@@ -19,8 +19,7 @@
 #include "rabit/internal/utils.h"
 #include "rabit/serializable.h"
 
-namespace rabit {
-namespace utils {
+namespace rabit::utils {
 /*! \brief re-use definition of dmlc::SeekStream */
 using SeekStream = dmlc::SeekStream;
 /**
@@ -31,9 +30,6 @@ struct MemoryFixSizeBuffer : public SeekStream {
   // similar to SEEK_END in libc
   static std::size_t constexpr kSeekEnd = std::numeric_limits<std::size_t>::max();
 
- protected:
-  MemoryFixSizeBuffer() = default;
-
  public:
   /**
    * @brief Ctor
@@ -68,7 +64,7 @@ struct MemoryFixSizeBuffer : public SeekStream {
    * @brief Current position in the buffer (stream).
    */
   std::size_t Tell() override { return curr_ptr_; }
-  virtual bool AtEnd() const { return curr_ptr_ == buffer_size_; }
+  [[nodiscard]] virtual bool AtEnd() const { return curr_ptr_ == buffer_size_; }
 
  protected:
   /*! \brief in memory buffer */
@@ -119,6 +115,5 @@ struct MemoryBufferStream : public SeekStream {
   /*! \brief current pointer */
   size_t curr_ptr_;
 };  // class MemoryBufferStream
-}  // namespace utils
-}  // namespace rabit
+}  // namespace rabit::utils
 #endif  // RABIT_INTERNAL_IO_H_
diff --git a/src/common/column_matrix.cc b/src/common/column_matrix.cc
index d8acfa7a5..1d44f1840 100644
--- a/src/common/column_matrix.cc
+++ b/src/common/column_matrix.cc
@@ -1,16 +1,27 @@
-/*!
- * Copyright 2017-2022 by XGBoost Contributors
+/**
+ * Copyright 2017-2023, XGBoost Contributors
  * \brief Utility for fast column-wise access
  */
 #include "column_matrix.h"
 
+#include <algorithm>    // for transform
+#include <cstddef>      // for size_t
+#include <cstdint>      // for uint64_t, uint8_t
+#include <limits>       // for numeric_limits
+#include <type_traits>  // for remove_reference_t
+#include <vector>       // for vector
+
+#include "../data/gradient_index.h"  // for GHistIndexMatrix
+#include "io.h"                      // for AlignedResourceReadStream, AlignedFileWriteStream
+#include "xgboost/base.h"            // for bst_feature_t
+#include "xgboost/span.h"            // for Span
+
-namespace xgboost {
-namespace common {
+namespace xgboost::common {
 void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_threshold) {
   auto const nfeature = gmat.Features();
   const size_t nrow = gmat.Size();
   // identify type of each column
-  type_.resize(nfeature);
+  type_ = common::MakeFixedVecWithMalloc(nfeature, ColumnType{});
 
   uint32_t max_val = std::numeric_limits<uint32_t>::max();
   for (bst_feature_t fid = 0; fid < nfeature; ++fid) {
@@ -34,7 +45,7 @@
 
   // want to compute storage boundary for each feature
   // using variants of prefix sum scan
-  feature_offsets_.resize(nfeature + 1);
+  feature_offsets_ = common::MakeFixedVecWithMalloc(nfeature + 1, std::size_t{0});
   size_t accum_index = 0;
   feature_offsets_[0] = accum_index;
   for (bst_feature_t fid = 1; fid < nfeature + 1; ++fid) {
@@ -49,9 +60,11 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_thres
   SetTypeSize(gmat.MaxNumBinPerFeat());
   auto storage_size =
       feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
-  index_.resize(storage_size, 0);
+
+  index_ = common::MakeFixedVecWithMalloc(storage_size, std::uint8_t{0});
+
   if (!all_dense_column) {
-    row_ind_.resize(feature_offsets_[nfeature]);
+    row_ind_ = common::MakeFixedVecWithMalloc(feature_offsets_[nfeature], std::size_t{0});
   }
 
   // store least bin id for each feature
@@ -59,7 +72,51 @@ void ColumnMatrix::InitStorage(GHistIndexMatrix const&
gmat, double sparse_thres any_missing_ = !gmat.IsDense(); - missing_flags_.clear(); + missing_ = MissingIndicator{0, false}; } -} // namespace common -} // namespace xgboost + +// IO procedures for external memory. +bool ColumnMatrix::Read(AlignedResourceReadStream* fi, uint32_t const* index_base) { + if (!common::ReadVec(fi, &index_)) { + return false; + } + if (!common::ReadVec(fi, &type_)) { + return false; + } + if (!common::ReadVec(fi, &row_ind_)) { + return false; + } + if (!common::ReadVec(fi, &feature_offsets_)) { + return false; + } + + if (!common::ReadVec(fi, &missing_.storage)) { + return false; + } + missing_.InitView(); + + index_base_ = index_base; + if (!fi->Read(&bins_type_size_)) { + return false; + } + if (!fi->Read(&any_missing_)) { + return false; + } + return true; +} + +std::size_t ColumnMatrix::Write(AlignedFileWriteStream* fo) const { + std::size_t bytes{0}; + + bytes += common::WriteVec(fo, index_); + bytes += common::WriteVec(fo, type_); + bytes += common::WriteVec(fo, row_ind_); + bytes += common::WriteVec(fo, feature_offsets_); + bytes += common::WriteVec(fo, missing_.storage); + + bytes += fo->Write(bins_type_size_); + bytes += fo->Write(any_missing_); + + return bytes; +} +} // namespace xgboost::common diff --git a/src/common/column_matrix.h b/src/common/column_matrix.h index f121b7a46..78361744d 100644 --- a/src/common/column_matrix.h +++ b/src/common/column_matrix.h @@ -1,5 +1,5 @@ -/*! - * Copyright 2017-2022 by Contributors +/** + * Copyright 2017-2023, XGBoost Contributors * \file column_matrix.h * \brief Utility for fast column-wise access * \author Philip Cho @@ -8,25 +8,30 @@ #ifndef XGBOOST_COMMON_COLUMN_MATRIX_H_ #define XGBOOST_COMMON_COLUMN_MATRIX_H_ -#include - #include +#include // for size_t +#include // for uint8_t #include #include -#include // std::move +#include // for move #include #include "../data/adapter.h" #include "../data/gradient_index.h" #include "algorithm.h" +#include "bitfield.h" // for RBitField8 #include "hist_util.h" +#include "ref_resource_view.h" // for RefResourceView +#include "xgboost/base.h" // for bst_bin_t +#include "xgboost/span.h" // for Span -namespace xgboost { -namespace common { - +namespace xgboost::common { class ColumnMatrix; +class AlignedFileWriteStream; +class AlignedResourceReadStream; + /*! \brief column type */ -enum ColumnType : uint8_t { kDenseColumn, kSparseColumn }; +enum ColumnType : std::uint8_t { kDenseColumn, kSparseColumn }; /*! \brief a column storage, to be used with ApplySplit. Note that each bin id is stored as index[i] + index_base. 
@@ -41,12 +46,12 @@ class Column { : index_(index), index_base_(least_bin_idx) {} virtual ~Column() = default; - bst_bin_t GetGlobalBinIdx(size_t idx) const { + [[nodiscard]] bst_bin_t GetGlobalBinIdx(size_t idx) const { return index_base_ + static_cast(index_[idx]); } /* returns number of elements in column */ - size_t Size() const { return index_.size(); } + [[nodiscard]] size_t Size() const { return index_.size(); } private: /* bin indexes in range [0, max_bins - 1] */ @@ -63,7 +68,7 @@ class SparseColumnIter : public Column { common::Span row_ind_; size_t idx_; - size_t const* RowIndices() const { return row_ind_.data(); } + [[nodiscard]] size_t const* RowIndices() const { return row_ind_.data(); } public: SparseColumnIter(common::Span index, bst_bin_t least_bin_idx, @@ -81,7 +86,7 @@ class SparseColumnIter : public Column { SparseColumnIter(SparseColumnIter const&) = delete; SparseColumnIter(SparseColumnIter&&) = default; - size_t GetRowIdx(size_t idx) const { return RowIndices()[idx]; } + [[nodiscard]] size_t GetRowIdx(size_t idx) const { return RowIndices()[idx]; } bst_bin_t operator[](size_t rid) { const size_t column_size = this->Size(); if (!((idx_) < column_size)) { @@ -101,6 +106,10 @@ class SparseColumnIter : public Column { } }; +/** + * @brief Column stored as a dense vector. It might still contain missing values as + * indicated by the missing flags. + */ template class DenseColumnIter : public Column { public: @@ -109,17 +118,19 @@ class DenseColumnIter : public Column { private: using Base = Column; /* flags for missing values in dense columns */ - std::vector const& missing_flags_; + LBitField32 missing_flags_; size_t feature_offset_; public: explicit DenseColumnIter(common::Span index, bst_bin_t index_base, - std::vector const& missing_flags, size_t feature_offset) + LBitField32 missing_flags, size_t feature_offset) : Base{index, index_base}, missing_flags_{missing_flags}, feature_offset_{feature_offset} {} DenseColumnIter(DenseColumnIter const&) = delete; DenseColumnIter(DenseColumnIter&&) = default; - bool IsMissing(size_t ridx) const { return missing_flags_[feature_offset_ + ridx]; } + [[nodiscard]] bool IsMissing(size_t ridx) const { + return missing_flags_.Check(feature_offset_ + ridx); + } bst_bin_t operator[](size_t ridx) const { if (any_missing) { @@ -131,12 +142,54 @@ class DenseColumnIter : public Column { }; /** - * \brief Column major matrix for gradient index. This matrix contains both dense column - * and sparse column, the type of the column is controlled by sparse threshold. When the - * number of missing values in a column is below the threshold it's classified as dense - * column. + * @brief Column major matrix for gradient index on CPU. + * + * This matrix contains both dense columns and sparse columns, the type of the column + * is controlled by the sparse threshold parameter. When the number of missing values + * in a column is below the threshold it's classified as dense column. */ class ColumnMatrix { + /** + * @brief A bit set for indicating whether an element in a dense column is missing. + */ + struct MissingIndicator { + LBitField32 missing; + RefResourceView storage; + + MissingIndicator() = default; + /** + * @param n_elements Size of the bit set + * @param init Initialize the indicator to true or false. + */ + MissingIndicator(std::size_t n_elements, bool init) { + auto m_size = missing.ComputeStorageSize(n_elements); + storage = common::MakeFixedVecWithMalloc(m_size, init ? 
~std::uint32_t{0} : std::uint32_t{0}); + this->InitView(); + } + /** @brief Set the i^th element to be a valid element (instead of missing). */ + void SetValid(typename LBitField32::index_type i) { missing.Clear(i); } + /** @brief assign the storage to the view. */ + void InitView() { + missing = LBitField32{Span{storage.data(), storage.size()}}; + } + + void GrowTo(std::size_t n_elements, bool init) { + CHECK(storage.Resource()->Type() == ResourceHandler::kMalloc) + << "[Internal Error]: Cannot grow the vector when external memory is used."; + auto m_size = missing.ComputeStorageSize(n_elements); + CHECK_GE(m_size, storage.size()); + if (m_size == storage.size()) { + return; + } + + auto new_storage = + common::MakeFixedVecWithMalloc(m_size, init ? ~std::uint32_t{0} : std::uint32_t{0}); + std::copy_n(storage.cbegin(), storage.size(), new_storage.begin()); + storage = std::move(new_storage); + this->InitView(); + } + }; + void InitStorage(GHistIndexMatrix const& gmat, double sparse_threshold); template @@ -144,9 +197,10 @@ class ColumnMatrix { if (type_[fid] == kDenseColumn) { ColumnBinT* begin = &local_index[feature_offsets_[fid]]; begin[rid] = bin_id - index_base_[fid]; - // not thread-safe with bool vector. FIXME(jiamingy): We can directly assign - // kMissingId to the index to avoid missing flags. - missing_flags_[feature_offsets_[fid] + rid] = false; + // not thread-safe with bit field. + // FIXME(jiamingy): We can directly assign kMissingId to the index to avoid missing + // flags. + missing_.SetValid(feature_offsets_[fid] + rid); } else { ColumnBinT* begin = &local_index[feature_offsets_[fid]]; begin[num_nonzeros_[fid]] = bin_id - index_base_[fid]; @@ -158,7 +212,9 @@ class ColumnMatrix { public: using ByteType = bool; // get number of features - bst_feature_t GetNumFeature() const { return static_cast(type_.size()); } + [[nodiscard]] bst_feature_t GetNumFeature() const { + return static_cast(type_.size()); + } ColumnMatrix() = default; ColumnMatrix(GHistIndexMatrix const& gmat, double sparse_threshold) { @@ -166,7 +222,7 @@ class ColumnMatrix { } /** - * \brief Initialize ColumnMatrix from GHistIndexMatrix with reference to the original + * @brief Initialize ColumnMatrix from GHistIndexMatrix with reference to the original * SparsePage. */ void InitFromSparse(SparsePage const& page, const GHistIndexMatrix& gmat, double sparse_threshold, @@ -178,8 +234,8 @@ class ColumnMatrix { } /** - * \brief Initialize ColumnMatrix from GHistIndexMatrix without reference to actual - * data. + * @brief Initialize ColumnMatrix from GHistIndexMatrix without reference to actual + * data. * * This function requires a binary search for each bin to get back the feature index * for those bins. @@ -199,7 +255,7 @@ class ColumnMatrix { } } - bool IsInitialized() const { return !type_.empty(); } + [[nodiscard]] bool IsInitialized() const { return !type_.empty(); } /** * \brief Push batch of data for Quantile DMatrix support. 
@@ -257,7 +313,7 @@ class ColumnMatrix { reinterpret_cast(&index_[feature_offset * bins_type_size_]), column_size}; return std::move(DenseColumnIter{ - bin_index, static_cast(index_base_[fidx]), missing_flags_, feature_offset}); + bin_index, static_cast(index_base_[fidx]), missing_.missing, feature_offset}); } // all columns are dense column and has no missing value @@ -265,7 +321,8 @@ class ColumnMatrix { template void SetIndexNoMissing(bst_row_t base_rowid, RowBinIdxT const* row_index, const size_t n_samples, const size_t n_features, int32_t n_threads) { - missing_flags_.resize(feature_offsets_[n_features], false); + missing_.GrowTo(feature_offsets_[n_features], false); + DispatchBinType(bins_type_size_, [&](auto t) { using ColumnBinT = decltype(t); auto column_index = Span{reinterpret_cast(index_.data()), @@ -290,9 +347,15 @@ class ColumnMatrix { void SetIndexMixedColumns(size_t base_rowid, Batch const& batch, const GHistIndexMatrix& gmat, float missing) { auto n_features = gmat.Features(); - missing_flags_.resize(feature_offsets_[n_features], true); - auto const* row_index = gmat.index.data() + gmat.row_ptr[base_rowid]; - num_nonzeros_.resize(n_features, 0); + + missing_.GrowTo(feature_offsets_[n_features], true); + auto const* row_index = gmat.index.data() + gmat.row_ptr[base_rowid]; + if (num_nonzeros_.empty()) { + num_nonzeros_ = common::MakeFixedVecWithMalloc(n_features, std::size_t{0}); + } else { + CHECK_EQ(num_nonzeros_.size(), n_features); + } + auto is_valid = data::IsValidFunctor{missing}; DispatchBinType(bins_type_size_, [&](auto t) { @@ -321,8 +384,9 @@ class ColumnMatrix { */ void SetIndexMixedColumns(const GHistIndexMatrix& gmat) { auto n_features = gmat.Features(); - missing_flags_.resize(feature_offsets_[n_features], true); - num_nonzeros_.resize(n_features, 0); + + missing_ = MissingIndicator{feature_offsets_[n_features], true}; + num_nonzeros_ = common::MakeFixedVecWithMalloc(n_features, std::size_t{0}); DispatchBinType(bins_type_size_, [&](auto t) { using ColumnBinT = decltype(t); @@ -335,106 +399,34 @@ class ColumnMatrix { }); } - BinTypeSize GetTypeSize() const { return bins_type_size_; } - auto GetColumnType(bst_feature_t fidx) const { return type_[fidx]; } + [[nodiscard]] BinTypeSize GetTypeSize() const { return bins_type_size_; } + [[nodiscard]] auto GetColumnType(bst_feature_t fidx) const { return type_[fidx]; } // And this returns part of state - bool AnyMissing() const { return any_missing_; } + [[nodiscard]] bool AnyMissing() const { return any_missing_; } // IO procedures for external memory. 
- bool Read(dmlc::SeekStream* fi, uint32_t const* index_base) { - fi->Read(&index_); -#if !DMLC_LITTLE_ENDIAN - // s390x - std::vector::type> int_types; - fi->Read(&int_types); - type_.resize(int_types.size()); - std::transform( - int_types.begin(), int_types.end(), type_.begin(), - [](std::underlying_type::type i) { return static_cast(i); }); -#else - fi->Read(&type_); -#endif // !DMLC_LITTLE_ENDIAN - - fi->Read(&row_ind_); - fi->Read(&feature_offsets_); - - std::vector missing; - fi->Read(&missing); - missing_flags_.resize(missing.size()); - std::transform(missing.cbegin(), missing.cend(), missing_flags_.begin(), - [](std::uint8_t flag) { return !!flag; }); - - index_base_ = index_base; -#if !DMLC_LITTLE_ENDIAN - std::underlying_type::type v; - fi->Read(&v); - bins_type_size_ = static_cast(v); -#else - fi->Read(&bins_type_size_); -#endif - - fi->Read(&any_missing_); - return true; - } - - size_t Write(dmlc::Stream* fo) const { - size_t bytes{0}; - - auto write_vec = [&](auto const& vec) { - fo->Write(vec); - bytes += vec.size() * sizeof(typename std::remove_reference_t::value_type) + - sizeof(uint64_t); - }; - write_vec(index_); -#if !DMLC_LITTLE_ENDIAN - // s390x - std::vector::type> int_types(type_.size()); - std::transform(type_.begin(), type_.end(), int_types.begin(), [](ColumnType t) { - return static_cast::type>(t); - }); - write_vec(int_types); -#else - write_vec(type_); -#endif // !DMLC_LITTLE_ENDIAN - write_vec(row_ind_); - write_vec(feature_offsets_); - // dmlc can not handle bool vector - std::vector missing(missing_flags_.size()); - std::transform(missing_flags_.cbegin(), missing_flags_.cend(), missing.begin(), - [](bool flag) { return static_cast(flag); }); - write_vec(missing); - -#if !DMLC_LITTLE_ENDIAN - auto v = static_cast::type>(bins_type_size_); - fo->Write(v); -#else - fo->Write(bins_type_size_); -#endif // DMLC_LITTLE_ENDIAN - bytes += sizeof(bins_type_size_); - fo->Write(any_missing_); - bytes += sizeof(any_missing_); - - return bytes; - } + [[nodiscard]] bool Read(AlignedResourceReadStream* fi, uint32_t const* index_base); + [[nodiscard]] std::size_t Write(AlignedFileWriteStream* fo) const; private: - std::vector index_; + RefResourceView index_; - std::vector type_; - /* indptr of a CSC matrix. */ - std::vector row_ind_; - /* indicate where each column's index and row_ind is stored. */ - std::vector feature_offsets_; - /* The number of nnz of each column. */ - std::vector num_nonzeros_; + RefResourceView type_; + /** @brief indptr of a CSC matrix. */ + RefResourceView row_ind_; + /** @brief indicate where each column's index and row_ind is stored. */ + RefResourceView feature_offsets_; + /** @brief The number of nnz of each column. */ + RefResourceView num_nonzeros_; // index_base_[fid]: least bin id for feature fid - uint32_t const* index_base_; - std::vector missing_flags_; + std::uint32_t const* index_base_; + + MissingIndicator missing_; + BinTypeSize bins_type_size_; bool any_missing_; }; -} // namespace common -} // namespace xgboost +} // namespace xgboost::common #endif // XGBOOST_COMMON_COLUMN_MATRIX_H_ diff --git a/src/common/hist_util.h b/src/common/hist_util.h index d2edf2ec8..2781da8e0 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -203,13 +203,33 @@ auto DispatchBinType(BinTypeSize type, Fn&& fn) { } /** - * \brief Optionally compressed gradient index. The compression works only with dense + * @brief Optionally compressed gradient index. The compression works only with dense * data. 
* * The main body of construction code is in gradient_index.cc, this struct is only a - * storage class. + * view class. */ -struct Index { +class Index { + private: + void SetBinTypeSize(BinTypeSize binTypeSize) { + binTypeSize_ = binTypeSize; + switch (binTypeSize) { + case kUint8BinsTypeSize: + func_ = &GetValueFromUint8; + break; + case kUint16BinsTypeSize: + func_ = &GetValueFromUint16; + break; + case kUint32BinsTypeSize: + func_ = &GetValueFromUint32; + break; + default: + CHECK(binTypeSize == kUint8BinsTypeSize || binTypeSize == kUint16BinsTypeSize || + binTypeSize == kUint32BinsTypeSize); + } + } + + public: // Inside the compressor, bin_idx is the index for cut value across all features. By // subtracting it with starting pointer of each feature, we can reduce it to smaller // value and store it with smaller types. Usable only with dense data. @@ -233,10 +253,24 @@ struct Index { } Index() { SetBinTypeSize(binTypeSize_); } - Index(const Index& i) = delete; - Index& operator=(Index i) = delete; + + Index(Index const& i) = delete; + Index& operator=(Index const& i) = delete; Index(Index&& i) = delete; - Index& operator=(Index&& i) = delete; + + /** @brief Move assignment for lazy initialization. */ + Index& operator=(Index&& i) = default; + + /** + * @brief Construct the index from data. + * + * @param data Storage for compressed histogram bin. + * @param bin_size Number of bytes for each bin. + */ + Index(Span data, BinTypeSize bin_size) : data_{data} { + this->SetBinTypeSize(bin_size); + } + uint32_t operator[](size_t i) const { if (!bin_offset_.empty()) { // dense, compressed @@ -247,26 +281,7 @@ struct Index { return func_(data_.data(), i); } } - void SetBinTypeSize(BinTypeSize binTypeSize) { - binTypeSize_ = binTypeSize; - switch (binTypeSize) { - case kUint8BinsTypeSize: - func_ = &GetValueFromUint8; - break; - case kUint16BinsTypeSize: - func_ = &GetValueFromUint16; - break; - case kUint32BinsTypeSize: - func_ = &GetValueFromUint32; - break; - default: - CHECK(binTypeSize == kUint8BinsTypeSize || binTypeSize == kUint16BinsTypeSize || - binTypeSize == kUint32BinsTypeSize); - } - } - BinTypeSize GetBinTypeSize() const { - return binTypeSize_; - } + [[nodiscard]] BinTypeSize GetBinTypeSize() const { return binTypeSize_; } template T const* data() const { // NOLINT return reinterpret_cast(data_.data()); @@ -275,30 +290,27 @@ struct Index { T* data() { // NOLINT return reinterpret_cast(data_.data()); } - uint32_t const* Offset() const { return bin_offset_.data(); } - size_t OffsetSize() const { return bin_offset_.size(); } - size_t Size() const { return data_.size() / (binTypeSize_); } + [[nodiscard]] std::uint32_t const* Offset() const { return bin_offset_.data(); } + [[nodiscard]] std::size_t OffsetSize() const { return bin_offset_.size(); } + [[nodiscard]] std::size_t Size() const { return data_.size() / (binTypeSize_); } - void Resize(const size_t n_bytes) { - data_.resize(n_bytes); - } // set the offset used in compression, cut_ptrs is the CSC indptr in HistogramCuts void SetBinOffset(std::vector const& cut_ptrs) { bin_offset_.resize(cut_ptrs.size() - 1); // resize to number of features. 
std::copy_n(cut_ptrs.begin(), bin_offset_.size(), bin_offset_.begin()); } - std::vector::const_iterator begin() const { // NOLINT - return data_.begin(); + auto begin() const { // NOLINT + return data_.data(); } - std::vector::const_iterator end() const { // NOLINT - return data_.end(); + auto end() const { // NOLINT + return data_.data() + data_.size(); } - std::vector::iterator begin() { // NOLINT - return data_.begin(); + auto begin() { // NOLINT + return data_.data(); } - std::vector::iterator end() { // NOLINT - return data_.end(); + auto end() { // NOLINT + return data_.data() + data_.size(); } private: @@ -313,12 +325,12 @@ struct Index { using Func = uint32_t (*)(uint8_t const*, size_t); - std::vector data_; + Span data_; // starting position of each feature inside the cut values (the indptr of the CSC cut matrix // HistogramCuts without the last entry.) Used for bin compression. std::vector bin_offset_; - BinTypeSize binTypeSize_ {kUint8BinsTypeSize}; + BinTypeSize binTypeSize_{kUint8BinsTypeSize}; Func func_; }; diff --git a/src/common/io.cc b/src/common/io.cc index ba97db574..db1624b95 100644 --- a/src/common/io.cc +++ b/src/common/io.cc @@ -200,21 +200,43 @@ std::string FileExtension(std::string fname, bool lower) { } } -struct PrivateMmapConstStream::MMAPFile { +// For some reason, NVCC 12.1 marks the function deleted if we expose it in the header. +// NVCC 11.8 doesn't allow `noexcept(false) = default` altogether. +ResourceHandler::~ResourceHandler() noexcept(false) {} // NOLINT + +struct MMAPFile { #if defined(xgboost_IS_WIN) HANDLE fd{INVALID_HANDLE_VALUE}; HANDLE file_map{INVALID_HANDLE_VALUE}; #else std::int32_t fd{0}; #endif - char* base_ptr{nullptr}; + std::byte* base_ptr{nullptr}; std::size_t base_size{0}; + std::size_t delta{0}; std::string path; + + MMAPFile() = default; + +#if defined(xgboost_IS_WIN) + MMAPFile(HANDLE fd, HANDLE fm, std::byte* base_ptr, std::size_t base_size, std::size_t delta, + std::string path) + : fd{fd}, + file_map{fm}, + base_ptr{base_ptr}, + base_size{base_size}, + delta{delta}, + path{std::move(path)} {} +#else + MMAPFile(std::int32_t fd, std::byte* base_ptr, std::size_t base_size, std::size_t delta, + std::string path) + : fd{fd}, base_ptr{base_ptr}, base_size{base_size}, delta{delta}, path{std::move(path)} {} +#endif }; -char* PrivateMmapConstStream::Open(std::string path, std::size_t offset, std::size_t length) { +std::unique_ptr Open(std::string path, std::size_t offset, std::size_t length) { if (length == 0) { - return nullptr; + return std::make_unique(); } #if defined(xgboost_IS_WIN) @@ -226,16 +248,18 @@ char* PrivateMmapConstStream::Open(std::string path, std::size_t offset, std::si CHECK_GE(fd, 0) << "Failed to open:" << path << ". " << SystemErrorMsg(); #endif - char* ptr{nullptr}; + std::byte* ptr{nullptr}; // Round down for alignment. auto view_start = offset / GetMmapAlignment() * GetMmapAlignment(); auto view_size = length + (offset - view_start); #if defined(__linux__) || defined(__GLIBC__) int prot{PROT_READ}; - ptr = reinterpret_cast(mmap64(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start)); + ptr = reinterpret_cast(mmap64(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start)); + madvise(ptr, view_size, MADV_WILLNEED); CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". 
" << SystemErrorMsg(); - handle_.reset(new MMAPFile{fd, ptr, view_size, std::move(path)}); + auto handle = + std::make_unique(fd, ptr, view_size, offset - view_start, std::move(path)); #elif defined(xgboost_IS_WIN) auto file_size = GetFileSize(fd, nullptr); DWORD access = PAGE_READONLY; @@ -244,33 +268,32 @@ char* PrivateMmapConstStream::Open(std::string path, std::size_t offset, std::si std::uint32_t loff = static_cast(view_start); std::uint32_t hoff = view_start >> 32; CHECK(map_file) << "Failed to map: " << path << ". " << SystemErrorMsg(); - ptr = reinterpret_cast(MapViewOfFile(map_file, access, hoff, loff, view_size)); + ptr = reinterpret_cast(MapViewOfFile(map_file, access, hoff, loff, view_size)); CHECK_NE(ptr, nullptr) << "Failed to map: " << path << ". " << SystemErrorMsg(); - handle_.reset(new MMAPFile{fd, map_file, ptr, view_size, std::move(path)}); + auto handle = std::make_unique(fd, map_file, ptr, view_size, offset - view_start, + std::move(path)); #else CHECK_LE(offset, std::numeric_limits::max()) << "File size has exceeded the limit on the current system."; int prot{PROT_READ}; - ptr = reinterpret_cast(mmap(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start)); + ptr = reinterpret_cast(mmap(nullptr, view_size, prot, MAP_PRIVATE, fd, view_start)); CHECK_NE(ptr, MAP_FAILED) << "Failed to map: " << path << ". " << SystemErrorMsg(); - handle_.reset(new MMAPFile{fd, ptr, view_size, std::move(path)}); + auto handle = + std::make_unique(fd, ptr, view_size, offset - view_start, std::move(path)); #endif // defined(__linux__) - ptr += (offset - view_start); - return ptr; + return handle; } -PrivateMmapConstStream::PrivateMmapConstStream(std::string path, std::size_t offset, - std::size_t length) - : MemoryFixSizeBuffer{}, handle_{nullptr} { - this->p_buffer_ = Open(std::move(path), offset, length); - this->buffer_size_ = length; -} +MmapResource::MmapResource(std::string path, std::size_t offset, std::size_t length) + : ResourceHandler{kMmap}, handle_{Open(std::move(path), offset, length)}, n_{length} {} -PrivateMmapConstStream::~PrivateMmapConstStream() { - CHECK(handle_); +MmapResource::~MmapResource() noexcept(false) { + if (!handle_) { + return; + } #if defined(xgboost_IS_WIN) - if (p_buffer_) { + if (handle_->base_ptr) { CHECK(UnmapViewOfFile(handle_->base_ptr)) "Faled to call munmap: " << SystemErrorMsg(); } if (handle_->fd != INVALID_HANDLE_VALUE) { @@ -290,6 +313,43 @@ PrivateMmapConstStream::~PrivateMmapConstStream() { } #endif } + +[[nodiscard]] void* MmapResource::Data() { + if (!handle_) { + return nullptr; + } + return handle_->base_ptr + handle_->delta; +} + +[[nodiscard]] std::size_t MmapResource::Size() const { return n_; } + +// For some reason, NVCC 12.1 marks the function deleted if we expose it in the header. +// NVCC 11.8 doesn't allow `noexcept(false) = default` altogether. 
+AlignedResourceReadStream::~AlignedResourceReadStream() noexcept(false) {}  // NOLINT
+PrivateMmapConstStream::~PrivateMmapConstStream() noexcept(false) {}        // NOLINT
+
+AlignedFileWriteStream::AlignedFileWriteStream(StringView path, StringView flags)
+    : pimpl_{dmlc::Stream::Create(path.c_str(), flags.c_str())} {}
+
+[[nodiscard]] std::size_t AlignedFileWriteStream::DoWrite(const void* ptr,
+                                                          std::size_t n_bytes) noexcept(true) {
+  pimpl_->Write(ptr, n_bytes);
+  return n_bytes;
+}
+
+AlignedMemWriteStream::AlignedMemWriteStream(std::string* p_buf)
+    : pimpl_{std::make_unique<MemoryBufferStream>(p_buf)} {}
+AlignedMemWriteStream::~AlignedMemWriteStream() = default;
+
+[[nodiscard]] std::size_t AlignedMemWriteStream::DoWrite(const void* ptr,
+                                                         std::size_t n_bytes) noexcept(true) {
+  this->pimpl_->Write(ptr, n_bytes);
+  return n_bytes;
+}
+
+[[nodiscard]] std::size_t AlignedMemWriteStream::Tell() const noexcept(true) {
+  return this->pimpl_->Tell();
+}
 }  // namespace xgboost::common
 
 #if defined(xgboost_IS_WIN)
diff --git a/src/common/io.h b/src/common/io.h
index ab408dec1..baf518aa5 100644
--- a/src/common/io.h
+++ b/src/common/io.h
@@ -4,22 +4,29 @@
  * \brief general stream interface for serialization, I/O
  * \author Tianqi Chen
  */
-
 #ifndef XGBOOST_COMMON_IO_H_
 #define XGBOOST_COMMON_IO_H_
 
 #include <dmlc/io.h>
 #include <rabit/internal/io.h>
 
-#include
-#include
-#include <memory>   // for unique_ptr
-#include <string>   // for string
+#include <algorithm>    // for min
+#include <array>        // for array
+#include <cstddef>      // for byte, size_t
+#include <cstdlib>      // for malloc, realloc, free
+#include <cstring>      // for memcpy
+#include <fstream>      // for ifstream
+#include <limits>       // for numeric_limits
+#include <memory>       // for unique_ptr
+#include <string>       // for string
+#include <type_traits>  // for alignment_of_v, enable_if_t
+#include <utility>      // for move
+#include <vector>       // for vector
 
 #include "common.h"
+#include "xgboost/string_view.h"  // for StringView
 
-namespace xgboost {
-namespace common {
+namespace xgboost::common {
 using MemoryFixSizeBuffer = rabit::utils::MemoryFixSizeBuffer;
 using MemoryBufferStream = rabit::utils::MemoryBufferStream;
@@ -58,8 +65,8 @@ class FixedSizeStream : public PeekableInStream {
   size_t Read(void* dptr, size_t size) override;
   size_t PeekRead(void* dptr, size_t size) override;
 
-  size_t Size() const { return buffer_.size(); }
-  size_t Tell() const { return pointer_; }
+  [[nodiscard]] std::size_t Size() const { return buffer_.size(); }
+  [[nodiscard]] std::size_t Tell() const { return pointer_; }
   void Seek(size_t pos);
 
   void Write(const void*, size_t) override {
@@ -129,18 +136,245 @@ inline std::string ReadAll(std::string const &path) {
   return content;
 }
 
+struct MMAPFile;
+
+/**
+ * @brief Handler for one-shot resource. Unlike `std::pmr::*`, the resource handler is
+ *        fixed once it's constructed. Users cannot use mutable operations like resize
+ *        without acquiring the specific resource first.
+ */
+class ResourceHandler {
+ public:
+  // RTTI
+  enum Kind : std::uint8_t {
+    kMalloc = 0,
+    kMmap = 1,
+  };
+
+ private:
+  Kind kind_{kMalloc};
+
+ public:
+  virtual void* Data() = 0;
+
+  template <typename T>
+  [[nodiscard]] T* DataAs() {
+    return reinterpret_cast<T*>(this->Data());
+  }
+
+  [[nodiscard]] virtual std::size_t Size() const = 0;
+  [[nodiscard]] auto Type() const { return kind_; }
+
+  // Allow exceptions for cleaning up resource.
+  virtual ~ResourceHandler() noexcept(false);
+
+  explicit ResourceHandler(Kind kind) : kind_{kind} {}
+  // Use shared_ptr to manage a pool-like resource handler. All copy and assignment
+  // operators are disabled.
+  ResourceHandler(ResourceHandler const& that) = delete;
+  ResourceHandler& operator=(ResourceHandler const& that) = delete;
+  ResourceHandler(ResourceHandler&& that) = delete;
+  ResourceHandler& operator=(ResourceHandler&& that) = delete;
+  /**
+   * @brief Whether two resources have the same type (both malloc or both mmap).
+   */
+  [[nodiscard]] bool IsSameType(ResourceHandler const& that) const {
+    return this->Type() == that.Type();
+  }
+};
+
+class MallocResource : public ResourceHandler {
+  void* ptr_{nullptr};
+  std::size_t n_{0};
+
+  void Clear() noexcept(true) {
+    std::free(ptr_);
+    ptr_ = nullptr;
+    n_ = 0;
+  }
+
+ public:
+  explicit MallocResource(std::size_t n_bytes) : ResourceHandler{kMalloc} { this->Resize(n_bytes); }
+  ~MallocResource() noexcept(true) override { this->Clear(); }
+
+  void* Data() override { return ptr_; }
+  [[nodiscard]] std::size_t Size() const override { return n_; }
+  /**
+   * @brief Resize the resource to n_bytes. Unlike std::vector::resize, it prefers realloc
+   *        over malloc.
+   *
+   * @tparam force_malloc Force the use of malloc over realloc. Used for testing.
+   *
+   * @param n_bytes The new size.
+   */
+  template <bool force_malloc = false>
+  void Resize(std::size_t n_bytes) {
+    // realloc(ptr, 0) works, but is deprecated.
+    if (n_bytes == 0) {
+      this->Clear();
+      return;
+    }
+
+    // If realloc fails, we need to copy the data ourselves.
+    bool need_copy{false};
+    void* new_ptr{nullptr};
+    // use realloc first, it can handle nullptr.
+    if constexpr (!force_malloc) {
+      new_ptr = std::realloc(ptr_, n_bytes);
+    }
+    // retry with malloc if realloc fails
+    if (!new_ptr) {
+      // ptr_ is preserved if realloc fails
+      new_ptr = std::malloc(n_bytes);
+      need_copy = true;
+    }
+    if (!new_ptr) {
+      // malloc fails
+      LOG(FATAL) << "bad_malloc: Failed to allocate " << n_bytes << " bytes.";
+    }
+
+    if (need_copy) {
+      std::copy_n(reinterpret_cast<char const*>(ptr_), n_, reinterpret_cast<char*>(new_ptr));
+    }
+    // default initialize
+    std::memset(reinterpret_cast<char*>(new_ptr) + n_, '\0', n_bytes - n_);
+    // free the old ptr if malloc is used.
+    if (need_copy) {
+      this->Clear();
+    }
+
+    ptr_ = new_ptr;
+    n_ = n_bytes;
+  }
+};
+
+/**
+ * @brief A class for wrapping mmap as a resource for RAII.
+ */
+class MmapResource : public ResourceHandler {
+  std::unique_ptr<MMAPFile> handle_;
+  std::size_t n_;
+
+ public:
+  MmapResource(std::string path, std::size_t offset, std::size_t length);
+  ~MmapResource() noexcept(false) override;
+
+  [[nodiscard]] void* Data() override;
+  [[nodiscard]] std::size_t Size() const override;
+};
+
+/**
+ * @brief Alignment for the resource read stream and the aligned write stream.
+ */
+constexpr std::size_t IOAlignment() {
+  // For most of the pod types in XGBoost, 8 bytes is sufficient.
+  return 8;
+}
+
+/**
+ * @brief Wrap resource into a dmlc stream.
+ *
+ * This class is to facilitate the use of mmap. Caller can optionally use the `Read()`
+ * method or the `Consume()` method. The former copies data into output, while the latter
+ * makes a copy only if it's a primitive type.
+ *
+ * Input is required to be aligned to IOAlignment().
+ */
+class AlignedResourceReadStream {
+  std::shared_ptr<ResourceHandler> resource_;
+  std::size_t curr_ptr_{0};
+
+  // Similar to SEEK_END in libc
+  static std::size_t constexpr kSeekEnd = std::numeric_limits<std::size_t>::max();
+
+ public:
+  explicit AlignedResourceReadStream(std::shared_ptr<ResourceHandler> resource)
+      : resource_{std::move(resource)} {}
+
+  [[nodiscard]] std::shared_ptr<ResourceHandler> Share() noexcept(true) { return resource_; }
+  /**
+   * @brief Consume n_bytes of data, no copying is performed.
+ * + * @return A pair with the beginning pointer and the number of available bytes, which + * may be smaller than requested. + */ + [[nodiscard]] auto Consume(std::size_t n_bytes) noexcept(true) { + auto res_size = resource_->Size(); + auto data = reinterpret_cast(resource_->Data()); + auto ptr = data + curr_ptr_; + + // Move the cursor + auto aligned_n_bytes = DivRoundUp(n_bytes, IOAlignment()) * IOAlignment(); + auto aligned_forward = std::min(res_size - curr_ptr_, aligned_n_bytes); + std::size_t forward = std::min(res_size - curr_ptr_, n_bytes); + + curr_ptr_ += aligned_forward; + + return std::pair{ptr, forward}; + } + + template + [[nodiscard]] auto Consume(T* out) noexcept(false) -> std::enable_if_t, bool> { + auto [ptr, size] = this->Consume(sizeof(T)); + if (size != sizeof(T)) { + return false; + } + CHECK_EQ(reinterpret_cast(ptr) % std::alignment_of_v, 0); + *out = *reinterpret_cast(ptr); + return true; + } + + [[nodiscard]] virtual std::size_t Tell() noexcept(true) { return curr_ptr_; } + /** + * @brief Read n_bytes of data, output is copied into ptr. + */ + [[nodiscard]] std::size_t Read(void* ptr, std::size_t n_bytes) noexcept(true) { + auto [res_ptr, forward] = this->Consume(n_bytes); + if (forward != 0) { + std::memcpy(ptr, res_ptr, forward); + } + return forward; + } + /** + * @brief Read a primitive type. + * + * @return Whether the read is successful. + */ + template + [[nodiscard]] auto Read(T* out) noexcept(false) -> std::enable_if_t, bool> { + return this->Consume(out); + } + /** + * @brief Read a vector. + * + * @return Whether the read is successful. + */ + template + [[nodiscard]] bool Read(std::vector* out) noexcept(true) { + std::uint64_t n{0}; + if (!this->Consume(&n)) { + return false; + } + out->resize(n); + + auto n_bytes = sizeof(T) * n; + if (this->Read(out->data(), n_bytes) != n_bytes) { + return false; + } + return true; + } + + virtual ~AlignedResourceReadStream() noexcept(false); +}; + /** * @brief Private mmap file as a read-only stream. * * It can calculate alignment automatically based on system page size (or allocation * granularity on Windows). + * + * The file is required to be aligned by IOAlignment(). */ -class PrivateMmapConstStream : public MemoryFixSizeBuffer { - struct MMAPFile; - std::unique_ptr handle_; - - char* Open(std::string path, std::size_t offset, std::size_t length); - +class PrivateMmapConstStream : public AlignedResourceReadStream { public: /** * @brief Construct a private mmap stream. @@ -149,11 +383,71 @@ class PrivateMmapConstStream : public MemoryFixSizeBuffer { * @param offset See the `offset` parameter of `mmap` for details. * @param length See the `length` parameter of `mmap` for details. */ - explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length); - void Write(void const*, std::size_t) override { LOG(FATAL) << "Read-only stream."; } - - ~PrivateMmapConstStream() override; + explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length) + : AlignedResourceReadStream{std::make_shared(path, offset, length)} {} + ~PrivateMmapConstStream() noexcept(false) override; }; -} // namespace common -} // namespace xgboost + +/** + * @brief Base class for write stream with alignment defined by IOAlignment(). 
+ */ +class AlignedWriteStream { + protected: + [[nodiscard]] virtual std::size_t DoWrite(const void* ptr, + std::size_t n_bytes) noexcept(true) = 0; + + public: + virtual ~AlignedWriteStream() = default; + + [[nodiscard]] std::size_t Write(const void* ptr, std::size_t n_bytes) noexcept(false) { + auto aligned_n_bytes = DivRoundUp(n_bytes, IOAlignment()) * IOAlignment(); + auto w_n_bytes = this->DoWrite(ptr, n_bytes); + CHECK_EQ(w_n_bytes, n_bytes); + auto remaining = aligned_n_bytes - n_bytes; + if (remaining > 0) { + std::array padding; + std::memset(padding.data(), '\0', padding.size()); + w_n_bytes = this->DoWrite(padding.data(), remaining); + CHECK_EQ(w_n_bytes, remaining); + } + return aligned_n_bytes; + } + + template + [[nodiscard]] std::enable_if_t, std::size_t> Write(T const& v) { + return this->Write(&v, sizeof(T)); + } +}; + +/** + * @brief Output stream backed by a file. Aligned to IOAlignment() bytes. + */ +class AlignedFileWriteStream : public AlignedWriteStream { + std::unique_ptr pimpl_; + + protected: + [[nodiscard]] std::size_t DoWrite(const void* ptr, std::size_t n_bytes) noexcept(true) override; + + public: + AlignedFileWriteStream() = default; + AlignedFileWriteStream(StringView path, StringView flags); + ~AlignedFileWriteStream() override = default; +}; + +/** + * @brief Output stream backed by memory buffer. Aligned to IOAlignment() bytes. + */ +class AlignedMemWriteStream : public AlignedFileWriteStream { + std::unique_ptr pimpl_; + + protected: + [[nodiscard]] std::size_t DoWrite(const void* ptr, std::size_t n_bytes) noexcept(true) override; + + public: + explicit AlignedMemWriteStream(std::string* p_buf); + ~AlignedMemWriteStream() override; + + [[nodiscard]] std::size_t Tell() const noexcept(true); +}; +} // namespace xgboost::common #endif // XGBOOST_COMMON_IO_H_ diff --git a/src/common/ref_resource_view.h b/src/common/ref_resource_view.h new file mode 100644 index 000000000..2804d79eb --- /dev/null +++ b/src/common/ref_resource_view.h @@ -0,0 +1,158 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#ifndef XGBOOST_COMMON_REF_RESOURCE_VIEW_H_ +#define XGBOOST_COMMON_REF_RESOURCE_VIEW_H_ + +#include // for fill_n +#include // for uint64_t +#include // for memcpy +#include // for shared_ptr, make_shared +#include // for is_reference_v, remove_reference_t, is_same_v +#include // for swap, move + +#include "io.h" // for ResourceHandler, AlignedResourceReadStream, MallocResource +#include "xgboost/logging.h" +#include "xgboost/span.h" // for Span + +namespace xgboost::common { +/** + * @brief A vector-like type that holds a reference counted resource. + * + * The vector size is immutable after construction. This way we can swap the underlying + * resource when needed. + */ +template +class RefResourceView { + static_assert(!std::is_reference_v); + + public: + using value_type = T; // NOLINT + using size_type = std::uint64_t; // NOLINT + + private: + value_type* ptr_{nullptr}; + size_type size_{0}; + std::shared_ptr mem_{nullptr}; + + public: + RefResourceView(value_type* ptr, size_type n, std::shared_ptr mem) + : ptr_{ptr}, size_{n}, mem_{std::move(mem)} { + CHECK_GE(mem_->Size(), n); + } + /** + * @brief Construct a view on ptr with length n. The ptr is held by the mem resource. + * + * @param ptr The pointer to view. + * @param n The length of the view. + * @param mem The owner of the pointer. + * @param init Initialize the view with this value. 
+   */
+  RefResourceView(value_type* ptr, size_type n, std::shared_ptr<ResourceHandler> mem,
+                  T const& init)
+      : RefResourceView{ptr, n, mem} {
+    if (n != 0) {
+      std::fill_n(ptr_, n, init);
+    }
+  }
+
+  ~RefResourceView() = default;
+
+  RefResourceView() = default;
+  RefResourceView(RefResourceView const& that) = delete;
+  RefResourceView(RefResourceView&& that) = delete;
+  RefResourceView& operator=(RefResourceView const& that) = delete;
+  /**
+   * @brief We allow move assignment for lazy initialization.
+   */
+  RefResourceView& operator=(RefResourceView&& that) = default;
+
+  [[nodiscard]] size_type size() const { return size_; }  // NOLINT
+  [[nodiscard]] size_type size_bytes() const {            // NOLINT
+    return Span<value_type const>{data(), size()}.size_bytes();
+  }
+  [[nodiscard]] value_type* data() { return ptr_; };              // NOLINT
+  [[nodiscard]] value_type const* data() const { return ptr_; };  // NOLINT
+  [[nodiscard]] bool empty() const { return size() == 0; }        // NOLINT
+
+  [[nodiscard]] auto cbegin() const { return data(); }         // NOLINT
+  [[nodiscard]] auto begin() { return data(); }                // NOLINT
+  [[nodiscard]] auto begin() const { return cbegin(); }        // NOLINT
+  [[nodiscard]] auto cend() const { return data() + size(); }  // NOLINT
+  [[nodiscard]] auto end() { return data() + size(); }         // NOLINT
+  [[nodiscard]] auto end() const { return cend(); }            // NOLINT
+
+  [[nodiscard]] auto const& front() const { return data()[0]; }          // NOLINT
+  [[nodiscard]] auto& front() { return data()[0]; }                      // NOLINT
+  [[nodiscard]] auto const& back() const { return data()[size() - 1]; }  // NOLINT
+  [[nodiscard]] auto& back() { return data()[size() - 1]; }              // NOLINT
+
+  [[nodiscard]] value_type& operator[](size_type i) { return ptr_[i]; }
+  [[nodiscard]] value_type const& operator[](size_type i) const { return ptr_[i]; }
+
+  /**
+   * @brief Get the underlying resource.
+   */
+  auto Resource() const { return mem_; }
+};
+
+/**
+ * @brief Read a vector from stream. Accepts both `std::vector` and `RefResourceView`.
+ *
+ * If the output vector is a reference-counted view, no copying occurs.
+ */
+template <typename Vec>
+[[nodiscard]] bool ReadVec(common::AlignedResourceReadStream* fi, Vec* vec) {
+  std::uint64_t n{0};
+  if (!fi->Read(&n)) {
+    return false;
+  }
+  if (n == 0) {
+    return true;
+  }
+
+  using T = typename Vec::value_type;
+  auto expected_bytes = sizeof(T) * n;
+
+  auto [ptr, n_bytes] = fi->Consume(expected_bytes);
+  if (n_bytes != expected_bytes) {
+    return false;
+  }
+
+  if constexpr (std::is_same_v<Vec, RefResourceView<T>>) {
+    *vec = RefResourceView<T>{reinterpret_cast<T*>(ptr), n, fi->Share()};
+  } else {
+    vec->resize(n);
+    std::memcpy(vec->data(), ptr, n_bytes);
+  }
+  return true;
+}
+
+/**
+ * @brief Write a vector to stream. Accepts both `std::vector` and `RefResourceView`.
+ */
+template <typename Vec>
+[[nodiscard]] std::size_t WriteVec(AlignedFileWriteStream* fo, Vec const& vec) {
+  std::size_t bytes{0};
+  auto n = static_cast<std::uint64_t>(vec.size());
+  bytes += fo->Write(n);
+  if (n == 0) {
+    return sizeof(n);
+  }
+
+  using T = typename std::remove_reference_t<Vec>::value_type;
+  bytes += fo->Write(vec.data(), vec.size() * sizeof(T));
+
+  return bytes;
+}
+
+/**
+ * @brief Make a fixed size `RefResourceView` with malloc resource.
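+ *
+ * For example, a zero-initialized counter vector (n_elements is any element count):
+ *
+ * @code
+ *   auto hit_count = MakeFixedVecWithMalloc(n_elements, std::size_t{0});
+ * @endcode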
+ */ +template +[[nodiscard]] RefResourceView MakeFixedVecWithMalloc(std::size_t n_elements, T const& init) { + auto resource = std::make_shared(n_elements * sizeof(T)); + return RefResourceView{resource->DataAs(), n_elements, resource, init}; +} +} // namespace xgboost::common +#endif // XGBOOST_COMMON_REF_RESOURCE_VIEW_H_ diff --git a/src/data/ellpack_page_raw_format.cu b/src/data/ellpack_page_raw_format.cu index 2f54b91c9..8316368ba 100644 --- a/src/data/ellpack_page_raw_format.cu +++ b/src/data/ellpack_page_raw_format.cu @@ -1,60 +1,59 @@ -/*! - * Copyright 2019-2021 XGBoost contributors +/** + * Copyright 2019-2023, XGBoost contributors */ -#include #include +#include // for size_t + +#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream +#include "../common/ref_resource_view.h" // for ReadVec, WriteVec #include "ellpack_page.cuh" -#include "sparse_page_writer.h" -#include "histogram_cut_format.h" - -namespace xgboost { -namespace data { +#include "histogram_cut_format.h" // for ReadHistogramCuts, WriteHistogramCuts +#include "sparse_page_writer.h" // for SparsePageFormat +namespace xgboost::data { DMLC_REGISTRY_FILE_TAG(ellpack_page_raw_format); - class EllpackPageRawFormat : public SparsePageFormat { public: - bool Read(EllpackPage* page, dmlc::SeekStream* fi) override { + bool Read(EllpackPage* page, common::AlignedResourceReadStream* fi) override { auto* impl = page->Impl(); if (!ReadHistogramCuts(&impl->Cuts(), fi)) { return false; } - fi->Read(&impl->n_rows); - fi->Read(&impl->is_dense); - fi->Read(&impl->row_stride); - fi->Read(&impl->gidx_buffer.HostVector()); + if (!fi->Read(&impl->n_rows)) { + return false; + } + if (!fi->Read(&impl->is_dense)) { + return false; + } + if (!fi->Read(&impl->row_stride)) { + return false; + } + if (!common::ReadVec(fi, &impl->gidx_buffer.HostVector())) { + return false; + } if (!fi->Read(&impl->base_rowid)) { return false; } return true; } - size_t Write(const EllpackPage& page, dmlc::Stream* fo) override { - size_t bytes = 0; + size_t Write(const EllpackPage& page, common::AlignedFileWriteStream* fo) override { + std::size_t bytes{0}; auto* impl = page.Impl(); bytes += WriteHistogramCuts(impl->Cuts(), fo); - fo->Write(impl->n_rows); - bytes += sizeof(impl->n_rows); - fo->Write(impl->is_dense); - bytes += sizeof(impl->is_dense); - fo->Write(impl->row_stride); - bytes += sizeof(impl->row_stride); + bytes += fo->Write(impl->n_rows); + bytes += fo->Write(impl->is_dense); + bytes += fo->Write(impl->row_stride); CHECK(!impl->gidx_buffer.ConstHostVector().empty()); - fo->Write(impl->gidx_buffer.HostVector()); - bytes += impl->gidx_buffer.ConstHostSpan().size_bytes() + sizeof(uint64_t); - fo->Write(impl->base_rowid); - bytes += sizeof(impl->base_rowid); + bytes += common::WriteVec(fo, impl->gidx_buffer.HostVector()); + bytes += fo->Write(impl->base_rowid); return bytes; } }; XGBOOST_REGISTER_ELLPACK_PAGE_FORMAT(raw) .describe("Raw ELLPACK binary data format.") - .set_body([]() { - return new EllpackPageRawFormat(); - }); - -} // namespace data -} // namespace xgboost + .set_body([]() { return new EllpackPageRawFormat(); }); +} // namespace xgboost::data diff --git a/src/data/gradient_index.cc b/src/data/gradient_index.cc index 11e9a4bec..1d47ae9e6 100644 --- a/src/data/gradient_index.cc +++ b/src/data/gradient_index.cc @@ -29,7 +29,7 @@ GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_ cut = common::SketchOnDMatrix(ctx, p_fmat, max_bins_per_feat, sorted_sketch, hess); const uint32_t nbins = 
cut.Ptrs().back();
-  hit_count.resize(nbins, 0);
+  hit_count = common::MakeFixedVecWithMalloc(nbins, std::size_t{0});
   hit_count_tloc_.resize(ctx->Threads() * nbins, 0);
 
   size_t new_size = 1;
@@ -37,8 +37,7 @@ GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_
     new_size += batch.Size();
   }
 
-  row_ptr.resize(new_size);
-  row_ptr[0] = 0;
+  row_ptr = common::MakeFixedVecWithMalloc(new_size, std::size_t{0});
 
   const bool isDense = p_fmat->IsDense();
   this->isDense_ = isDense;
@@ -61,8 +60,8 @@ GHistIndexMatrix::GHistIndexMatrix(Context const *ctx, DMatrix *p_fmat, bst_bin_
 
 GHistIndexMatrix::GHistIndexMatrix(MetaInfo const &info, common::HistogramCuts &&cuts,
                                    bst_bin_t max_bin_per_feat)
-    : row_ptr(info.num_row_ + 1, 0),
-      hit_count(cuts.TotalBins(), 0),
+    : row_ptr{common::MakeFixedVecWithMalloc(info.num_row_ + 1, std::size_t{0})},
+      hit_count{common::MakeFixedVecWithMalloc(cuts.TotalBins(), std::size_t{0})},
      cut{std::forward<common::HistogramCuts>(cuts)},
       max_numeric_bins_per_feat(max_bin_per_feat),
       isDense_{info.num_col_ * info.num_row_ == info.num_nonzero_} {}
@@ -95,12 +94,10 @@ GHistIndexMatrix::GHistIndexMatrix(SparsePage const &batch, common::Span
   PushBatch(batch, ft, n_threads);
@@ -128,20 +125,45 @@ INSTANTIATION_PUSH(data::SparsePageAdapterBatch)
 #undef INSTANTIATION_PUSH
 
 void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) {
+  auto make_index = [this, n_index](auto t, common::BinTypeSize t_size) {
+    // Must resize instead of allocating a new one. This function is called every time
+    // a new batch is pushed, and we grow the size accordingly without losing the data
+    // from the previous batches.
+    using T = decltype(t);
+    std::size_t n_bytes = sizeof(T) * n_index;
+    CHECK_GE(n_bytes, this->data.size());
+
+    auto resource = this->data.Resource();
+    decltype(this->data) new_vec;
+    if (!resource) {
+      CHECK(this->data.empty());
+      new_vec = common::MakeFixedVecWithMalloc(n_bytes, std::uint8_t{0});
+    } else {
+      CHECK(resource->Type() == common::ResourceHandler::kMalloc);
+      auto malloc_resource = std::dynamic_pointer_cast<common::MallocResource>(resource);
+      CHECK(malloc_resource);
+      malloc_resource->Resize(n_bytes);
+
+      // gcc-11.3 doesn't work if DataAs is used.
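+      // Re-wrap the same (grown) allocation in a larger view; Resize keeps the bytes
+      // already written by the previous batches.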
+      std::uint8_t *new_ptr = reinterpret_cast<std::uint8_t *>(malloc_resource->Data());
+      new_vec = {new_ptr, n_bytes / sizeof(std::uint8_t), malloc_resource};
+    }
+    this->data = std::move(new_vec);
+    this->index = common::Index{common::Span<std::uint8_t>{data.data(), data.size()}, t_size};
+  };
+
   if ((MaxNumBinPerFeat() - 1 <= static_cast<bst_bin_t>(std::numeric_limits<std::uint8_t>::max())) &&
       isDense) {
     // compress dense index to uint8
-    index.SetBinTypeSize(common::kUint8BinsTypeSize);
-    index.Resize((sizeof(uint8_t)) * n_index);
+    make_index(std::uint8_t{}, common::kUint8BinsTypeSize);
   } else if ((MaxNumBinPerFeat() - 1 > static_cast<bst_bin_t>(std::numeric_limits<std::uint8_t>::max()) &&
              MaxNumBinPerFeat() - 1 <= static_cast<bst_bin_t>(std::numeric_limits<std::uint16_t>::max())) &&
             isDense) {
     // compress dense index to uint16
-    index.SetBinTypeSize(common::kUint16BinsTypeSize);
-    index.Resize((sizeof(uint16_t)) * n_index);
+    make_index(std::uint16_t{}, common::kUint16BinsTypeSize);
   } else {
-    index.SetBinTypeSize(common::kUint32BinsTypeSize);
-    index.Resize((sizeof(uint32_t)) * n_index);
+    // no compression
+    make_index(std::uint32_t{}, common::kUint32BinsTypeSize);
   }
 }
@@ -214,11 +236,11 @@ float GHistIndexMatrix::GetFvalue(std::vector<std::uint32_t> const &ptrs,
   return std::numeric_limits<float>::quiet_NaN();
 }
 
-bool GHistIndexMatrix::ReadColumnPage(dmlc::SeekStream *fi) {
+bool GHistIndexMatrix::ReadColumnPage(common::AlignedResourceReadStream *fi) {
   return this->columns_->Read(fi, this->cut.Ptrs().data());
 }
 
-size_t GHistIndexMatrix::WriteColumnPage(dmlc::Stream *fo) const {
+std::size_t GHistIndexMatrix::WriteColumnPage(common::AlignedFileWriteStream *fo) const {
   return this->columns_->Write(fo);
 }
 }  // namespace xgboost
diff --git a/src/data/gradient_index.cu b/src/data/gradient_index.cu
index af5b0f67b..42018eab4 100644
--- a/src/data/gradient_index.cu
+++ b/src/data/gradient_index.cu
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2022 by XGBoost Contributors +/** + * Copyright 2022-2023, XGBoost Contributors */ #include // std::unique_ptr @@ -41,9 +41,9 @@ void SetIndexData(Context const* ctx, EllpackPageImpl const* page, } void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page, - std::vector* p_out) { + common::RefResourceView* p_out) { auto& row_ptr = *p_out; - row_ptr.resize(page->Size() + 1, 0); + row_ptr = common::MakeFixedVecWithMalloc(page->Size() + 1, std::size_t{0}); if (page->is_dense) { std::fill(row_ptr.begin() + 1, row_ptr.end(), page->row_stride); } else { @@ -95,7 +95,7 @@ GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info, ctx, page, &hit_count_tloc_, [&](auto bin_idx, auto) { return bin_idx; }, this); } - this->hit_count.resize(n_bins_total, 0); + this->hit_count = common::MakeFixedVecWithMalloc(n_bins_total, std::size_t{0}); this->GatherHitCount(ctx->Threads(), n_bins_total); // sanity checks diff --git a/src/data/gradient_index.h b/src/data/gradient_index.h index d36373d6b..840be4b06 100644 --- a/src/data/gradient_index.h +++ b/src/data/gradient_index.h @@ -9,13 +9,14 @@ #include // for atomic #include // for uint32_t #include // for size_t -#include +#include // for make_unique #include #include "../common/categorical.h" #include "../common/error_msg.h" // for InfInData #include "../common/hist_util.h" #include "../common/numeric.h" +#include "../common/ref_resource_view.h" // for RefResourceView #include "../common/threading_utils.h" #include "../common/transform_iterator.h" // for MakeIndexTransformIter #include "adapter.h" @@ -25,9 +26,11 @@ namespace xgboost { namespace common { class ColumnMatrix; +class AlignedFileWriteStream; } // namespace common -/*! - * \brief preprocessed global index matrix, in CSR format + +/** + * @brief preprocessed global index matrix, in CSR format. * * Transform floating values to integer index in histogram This is a global histogram * index for CPU histogram. On GPU ellpack page is used. @@ -133,20 +136,22 @@ class GHistIndexMatrix { } public: - /*! \brief row pointer to rows by element position */ - std::vector row_ptr; - /*! \brief The index data */ + /** @brief row pointer to rows by element position */ + common::RefResourceView row_ptr; + /** @brief data storage for index. */ + common::RefResourceView data; + /** @brief The histogram index. */ common::Index index; - /*! \brief hit count of each index, used for constructing the ColumnMatrix */ - std::vector hit_count; - /*! \brief The corresponding cuts */ + /** @brief hit count of each index, used for constructing the ColumnMatrix */ + common::RefResourceView hit_count; + /** @brief The corresponding cuts */ common::HistogramCuts cut; - /** \brief max_bin for each feature. */ + /** @brief max_bin for each feature. */ bst_bin_t max_numeric_bins_per_feat; - /*! \brief base row index for current page (used by external memory) */ - size_t base_rowid{0}; + /** @brief base row index for current page (used by external memory) */ + bst_row_t base_rowid{0}; - bst_bin_t MaxNumBinPerFeat() const { + [[nodiscard]] bst_bin_t MaxNumBinPerFeat() const { return std::max(static_cast(cut.MaxCategory() + 1), max_numeric_bins_per_feat); } @@ -218,29 +223,27 @@ class GHistIndexMatrix { } } - bool IsDense() const { - return isDense_; - } + [[nodiscard]] bool IsDense() const { return isDense_; } void SetDense(bool is_dense) { isDense_ = is_dense; } /** - * \brief Get the local row index. + * @brief Get the local row index. 
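+   *
+   * E.g. with base_rowid = 8, RowIdx(10) reads row_ptr[2].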
*/ - size_t RowIdx(size_t ridx) const { return row_ptr[ridx - base_rowid]; } + [[nodiscard]] std::size_t RowIdx(size_t ridx) const { return row_ptr[ridx - base_rowid]; } - bst_row_t Size() const { return row_ptr.empty() ? 0 : row_ptr.size() - 1; } - bst_feature_t Features() const { return cut.Ptrs().size() - 1; } + [[nodiscard]] bst_row_t Size() const { return row_ptr.empty() ? 0 : row_ptr.size() - 1; } + [[nodiscard]] bst_feature_t Features() const { return cut.Ptrs().size() - 1; } - bool ReadColumnPage(dmlc::SeekStream* fi); - size_t WriteColumnPage(dmlc::Stream* fo) const; + [[nodiscard]] bool ReadColumnPage(common::AlignedResourceReadStream* fi); + [[nodiscard]] std::size_t WriteColumnPage(common::AlignedFileWriteStream* fo) const; - common::ColumnMatrix const& Transpose() const; + [[nodiscard]] common::ColumnMatrix const& Transpose() const; - bst_bin_t GetGindex(size_t ridx, size_t fidx) const; + [[nodiscard]] bst_bin_t GetGindex(size_t ridx, size_t fidx) const; - float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const; - float GetFvalue(std::vector const& ptrs, std::vector const& values, - std::vector const& mins, bst_row_t ridx, bst_feature_t fidx, - bool is_cat) const; + [[nodiscard]] float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const; + [[nodiscard]] float GetFvalue(std::vector const& ptrs, + std::vector const& values, std::vector const& mins, + bst_row_t ridx, bst_feature_t fidx, bool is_cat) const; private: std::unique_ptr columns_; @@ -294,5 +297,5 @@ void AssignColumnBinIndex(GHistIndexMatrix const& page, Fn&& assign) { } }); } -} // namespace xgboost +} // namespace xgboost #endif // XGBOOST_DATA_GRADIENT_INDEX_H_ diff --git a/src/data/gradient_index_format.cc b/src/data/gradient_index_format.cc index 204157682..ac52c0697 100644 --- a/src/data/gradient_index_format.cc +++ b/src/data/gradient_index_format.cc @@ -1,38 +1,49 @@ -/*! - * Copyright 2021-2022 XGBoost contributors +/** + * Copyright 2021-2023 XGBoost contributors */ -#include "sparse_page_writer.h" -#include "gradient_index.h" -#include "histogram_cut_format.h" +#include // for size_t +#include // for uint8_t +#include // for underlying_type_t +#include // for vector -namespace xgboost { -namespace data { +#include "../common/io.h" // for AlignedResourceReadStream +#include "../common/ref_resource_view.h" // for ReadVec, WriteVec +#include "gradient_index.h" // for GHistIndexMatrix +#include "histogram_cut_format.h" // for ReadHistogramCuts +#include "sparse_page_writer.h" // for SparsePageFormat + +namespace xgboost::data { class GHistIndexRawFormat : public SparsePageFormat { public: - bool Read(GHistIndexMatrix* page, dmlc::SeekStream* fi) override { + bool Read(GHistIndexMatrix* page, common::AlignedResourceReadStream* fi) override { + CHECK(fi); + if (!ReadHistogramCuts(&page->cut, fi)) { return false; } + // indptr - fi->Read(&page->row_ptr); - // data - std::vector data; - if (!fi->Read(&data)) { + if (!common::ReadVec(fi, &page->row_ptr)) { return false; } - page->index.Resize(data.size()); - std::copy(data.cbegin(), data.cend(), page->index.begin()); - // bin type + + // data + // - bin type // Old gcc doesn't support reading from enum. 
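+    // Read the bin type as its underlying integer, then cast it back to BinTypeSize.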
std::underlying_type_t uint_bin_type{0}; if (!fi->Read(&uint_bin_type)) { return false; } - common::BinTypeSize size_type = - static_cast(uint_bin_type); - page->index.SetBinTypeSize(size_type); + common::BinTypeSize size_type = static_cast(uint_bin_type); + // - index buffer + if (!common::ReadVec(fi, &page->data)) { + return false; + } + // - index + page->index = common::Index{common::Span{page->data.data(), page->data.size()}, size_type}; + // hit count - if (!fi->Read(&page->hit_count)) { + if (!common::ReadVec(fi, &page->hit_count)) { return false; } if (!fi->Read(&page->max_numeric_bins_per_feat)) { @@ -50,38 +61,33 @@ class GHistIndexRawFormat : public SparsePageFormat { page->index.SetBinOffset(page->cut.Ptrs()); } - page->ReadColumnPage(fi); + if (!page->ReadColumnPage(fi)) { + return false; + } return true; } - size_t Write(GHistIndexMatrix const &page, dmlc::Stream *fo) override { - size_t bytes = 0; + std::size_t Write(GHistIndexMatrix const& page, common::AlignedFileWriteStream* fo) override { + std::size_t bytes = 0; bytes += WriteHistogramCuts(page.cut, fo); // indptr - fo->Write(page.row_ptr); - bytes += page.row_ptr.size() * sizeof(decltype(page.row_ptr)::value_type) + - sizeof(uint64_t); + bytes += common::WriteVec(fo, page.row_ptr); + // data - std::vector data(page.index.begin(), page.index.end()); - fo->Write(data); - bytes += data.size() * sizeof(decltype(data)::value_type) + sizeof(uint64_t); - // bin type - std::underlying_type_t uint_bin_type = - page.index.GetBinTypeSize(); - fo->Write(uint_bin_type); - bytes += sizeof(page.index.GetBinTypeSize()); + // - bin type + std::underlying_type_t uint_bin_type = page.index.GetBinTypeSize(); + bytes += fo->Write(uint_bin_type); + // - index buffer + std::vector data(page.index.begin(), page.index.end()); + bytes += fo->Write(static_cast(data.size())); + bytes += fo->Write(data.data(), data.size()); + // hit count - fo->Write(page.hit_count); - bytes += - page.hit_count.size() * sizeof(decltype(page.hit_count)::value_type) + - sizeof(uint64_t); + bytes += common::WriteVec(fo, page.hit_count); // max_bins, base row, is_dense - fo->Write(page.max_numeric_bins_per_feat); - bytes += sizeof(page.max_numeric_bins_per_feat); - fo->Write(page.base_rowid); - bytes += sizeof(page.base_rowid); - fo->Write(page.IsDense()); - bytes += sizeof(page.IsDense()); + bytes += fo->Write(page.max_numeric_bins_per_feat); + bytes += fo->Write(page.base_rowid); + bytes += fo->Write(page.IsDense()); bytes += page.WriteColumnPage(fo); return bytes; @@ -93,6 +99,4 @@ DMLC_REGISTRY_FILE_TAG(gradient_index_format); XGBOOST_REGISTER_GHIST_INDEX_PAGE_FORMAT(raw) .describe("Raw GHistIndex binary data format.") .set_body([]() { return new GHistIndexRawFormat(); }); - -} // namespace data -} // namespace xgboost +} // namespace xgboost::data diff --git a/src/data/histogram_cut_format.h b/src/data/histogram_cut_format.h index 39961c4a2..45a96134f 100644 --- a/src/data/histogram_cut_format.h +++ b/src/data/histogram_cut_format.h @@ -1,36 +1,38 @@ -/*! 
- * Copyright 2021 XGBoost contributors +/** + * Copyright 2021-2023, XGBoost contributors */ #ifndef XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_ #define XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_ -#include "../common/hist_util.h" +#include // for Stream -namespace xgboost { -namespace data { -inline bool ReadHistogramCuts(common::HistogramCuts *cuts, dmlc::SeekStream *fi) { - if (!fi->Read(&cuts->cut_values_.HostVector())) { +#include // for size_t + +#include "../common/hist_util.h" // for HistogramCuts +#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream +#include "../common/ref_resource_view.h" // for WriteVec, ReadVec + +namespace xgboost::data { +inline bool ReadHistogramCuts(common::HistogramCuts *cuts, common::AlignedResourceReadStream *fi) { + if (!common::ReadVec(fi, &cuts->cut_values_.HostVector())) { return false; } - if (!fi->Read(&cuts->cut_ptrs_.HostVector())) { + if (!common::ReadVec(fi, &cuts->cut_ptrs_.HostVector())) { return false; } - if (!fi->Read(&cuts->min_vals_.HostVector())) { + if (!common::ReadVec(fi, &cuts->min_vals_.HostVector())) { return false; } return true; } -inline size_t WriteHistogramCuts(common::HistogramCuts const &cuts, dmlc::Stream *fo) { - size_t bytes = 0; - fo->Write(cuts.cut_values_.ConstHostVector()); - bytes += cuts.cut_values_.ConstHostSpan().size_bytes() + sizeof(uint64_t); - fo->Write(cuts.cut_ptrs_.ConstHostVector()); - bytes += cuts.cut_ptrs_.ConstHostSpan().size_bytes() + sizeof(uint64_t); - fo->Write(cuts.min_vals_.ConstHostVector()); - bytes += cuts.min_vals_.ConstHostSpan().size_bytes() + sizeof(uint64_t); +inline std::size_t WriteHistogramCuts(common::HistogramCuts const &cuts, + common::AlignedFileWriteStream *fo) { + std::size_t bytes = 0; + bytes += common::WriteVec(fo, cuts.Values()); + bytes += common::WriteVec(fo, cuts.Ptrs()); + bytes += common::WriteVec(fo, cuts.MinValues()); return bytes; } -} // namespace data -} // namespace xgboost +} // namespace xgboost::data #endif // XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_ diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index 627606aa3..c2c9a1d70 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -240,9 +240,9 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p, * Generate gradient index. */ this->ghist_ = std::make_unique(Info(), std::move(cuts), p.max_bin); - size_t rbegin = 0; - size_t prev_sum = 0; - size_t i = 0; + std::size_t rbegin = 0; + std::size_t prev_sum = 0; + std::size_t i = 0; while (iter.Next()) { HostAdapterDispatch(proxy, [&](auto const& batch) { proxy->Info().num_nonzero_ = batch_nnz[i]; diff --git a/src/data/sparse_page_raw_format.cc b/src/data/sparse_page_raw_format.cc index 1e5d1ec71..1edf27c46 100644 --- a/src/data/sparse_page_raw_format.cc +++ b/src/data/sparse_page_raw_format.cc @@ -1,59 +1,57 @@ -/*! - * Copyright (c) 2015-2021 by Contributors +/** + * Copyright 2015-2023, XGBoost Contributors * \file sparse_page_raw_format.cc * Raw binary format of sparse page. 
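+ * On-disk layout per page (see Write below): the offset vector, the entry data when
+ * non-empty, then base_rowid; the aligned stream pads each field to IOAlignment().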
*/ -#include #include -#include "xgboost/logging.h" +#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream +#include "../common/ref_resource_view.h" // for WriteVec #include "./sparse_page_writer.h" +#include "xgboost/data.h" +#include "xgboost/logging.h" -namespace xgboost { -namespace data { - +namespace xgboost::data { DMLC_REGISTRY_FILE_TAG(sparse_page_raw_format); -template +template class SparsePageRawFormat : public SparsePageFormat { public: - bool Read(T* page, dmlc::SeekStream* fi) override { + bool Read(T* page, common::AlignedResourceReadStream* fi) override { auto& offset_vec = page->offset.HostVector(); - if (!fi->Read(&offset_vec)) { + if (!common::ReadVec(fi, &offset_vec)) { return false; } auto& data_vec = page->data.HostVector(); CHECK_NE(page->offset.Size(), 0U) << "Invalid SparsePage file"; data_vec.resize(offset_vec.back()); if (page->data.Size() != 0) { - size_t n_bytes = fi->Read(dmlc::BeginPtr(data_vec), - (page->data).Size() * sizeof(Entry)); - CHECK_EQ(n_bytes, (page->data).Size() * sizeof(Entry)) - << "Invalid SparsePage file"; + if (!common::ReadVec(fi, &data_vec)) { + return false; + } + } + if (!fi->Read(&page->base_rowid, sizeof(page->base_rowid))) { + return false; } - fi->Read(&page->base_rowid, sizeof(page->base_rowid)); return true; } - size_t Write(const T& page, dmlc::Stream* fo) override { + std::size_t Write(const T& page, common::AlignedFileWriteStream* fo) override { const auto& offset_vec = page.offset.HostVector(); const auto& data_vec = page.data.HostVector(); CHECK(page.offset.Size() != 0 && offset_vec[0] == 0); CHECK_EQ(offset_vec.back(), page.data.Size()); - fo->Write(offset_vec); - auto bytes = page.MemCostBytes(); - bytes += sizeof(uint64_t); + + std::size_t bytes{0}; + bytes += common::WriteVec(fo, offset_vec); if (page.data.Size() != 0) { - fo->Write(dmlc::BeginPtr(data_vec), page.data.Size() * sizeof(Entry)); + bytes += common::WriteVec(fo, data_vec); } - fo->Write(&page.base_rowid, sizeof(page.base_rowid)); - bytes += sizeof(page.base_rowid); + bytes += fo->Write(&page.base_rowid, sizeof(page.base_rowid)); return bytes; } private: - /*! 
\brief external memory column offset */ - std::vector disk_offset_; }; XGBOOST_REGISTER_SPARSE_PAGE_FORMAT(raw) @@ -74,5 +72,4 @@ XGBOOST_REGISTER_SORTED_CSC_PAGE_FORMAT(raw) return new SparsePageRawFormat(); }); -} // namespace data -} // namespace xgboost +} // namespace xgboost::data diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h index 9f7bee521..b32c536af 100644 --- a/src/data/sparse_page_source.h +++ b/src/data/sparse_page_source.h @@ -6,9 +6,11 @@ #define XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_ #include // for min +#include // for atomic #include // for async #include #include +#include // for mutex #include #include #include // for pair, move @@ -18,7 +20,6 @@ #include "../common/io.h" // for PrivateMmapConstStream #include "../common/timer.h" // for Monitor, Timer #include "adapter.h" -#include "dmlc/common.h" // for OMPException #include "proxy_dmatrix.h" // for DMatrixProxy #include "sparse_page_writer.h" // for SparsePageFormat #include "xgboost/base.h" @@ -93,6 +94,47 @@ class TryLockGuard { } }; +// Similar to `dmlc::OMPException`, but doesn't need the threads to be joined before rethrow +class ExceHandler { + std::mutex mutex_; + std::atomic flag_{false}; + std::exception_ptr curr_exce_{nullptr}; + + public: + template + decltype(auto) Run(Fn&& fn) noexcept(true) { + try { + return fn(); + } catch (dmlc::Error const& e) { + std::lock_guard guard{mutex_}; + if (!curr_exce_) { + curr_exce_ = std::current_exception(); + } + flag_ = true; + } catch (std::exception const& e) { + std::lock_guard guard{mutex_}; + if (!curr_exce_) { + curr_exce_ = std::current_exception(); + } + flag_ = true; + } catch (...) { + std::lock_guard guard{mutex_}; + if (!curr_exce_) { + curr_exce_ = std::current_exception(); + } + flag_ = true; + } + return std::invoke_result_t(); + } + + void Rethrow() noexcept(false) { + if (flag_) { + CHECK(curr_exce_); + std::rethrow_exception(curr_exce_); + } + } +}; + /** * @brief Base class for all page sources. Handles fetching, writing, and iteration. */ @@ -122,7 +164,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl { // Catching exception in pre-fetch threads to prevent segfault. Not always work though, // OOM error can be delayed due to lazy commit. On the bright side, if mmap is used then // OOM error should be rare. 
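+  // ExceHandler (defined above) can rethrow without first joining the prefetch
+  // threads, which dmlc::OMPException requires.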
- dmlc::OMPException exec_; + ExceHandler exce_; common::Monitor monitor_; bool ReadCache() { @@ -141,7 +183,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl { CHECK_GT(n_prefetch_batches, 0) << "total batches:" << n_batches_; std::size_t fetch_it = count_; - exec_.Rethrow(); + exce_.Rethrow(); for (std::size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) { fetch_it %= n_batches_; // ring @@ -152,7 +194,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl { CHECK_LT(fetch_it, cache_info_->offset.size()); ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self, this]() { auto page = std::make_shared(); - this->exec_.Run([&] { + this->exce_.Run([&] { std::unique_ptr> fmt{CreatePageFormat("raw")}; auto name = self->cache_info_->ShardName(); auto [offset, length] = self->cache_info_->View(fetch_it); @@ -172,7 +214,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl { CHECK(!(*ring_)[count_].valid()); monitor_.Stop("Wait"); - exec_.Rethrow(); + exce_.Rethrow(); return true; } @@ -184,11 +226,11 @@ class SparsePageSourceImpl : public BatchIteratorImpl { std::unique_ptr> fmt{CreatePageFormat("raw")}; auto name = cache_info_->ShardName(); - std::unique_ptr fo; + std::unique_ptr fo; if (this->Iter() == 0) { - fo.reset(dmlc::Stream::Create(name.c_str(), "wb")); + fo = std::make_unique(StringView{name}, "wb"); } else { - fo.reset(dmlc::Stream::Create(name.c_str(), "ab")); + fo = std::make_unique(StringView{name}, "ab"); } auto bytes = fmt->Write(*page_, fo.get()); diff --git a/src/data/sparse_page_writer.h b/src/data/sparse_page_writer.h index 91a6504fe..c909d817d 100644 --- a/src/data/sparse_page_writer.h +++ b/src/data/sparse_page_writer.h @@ -1,52 +1,44 @@ -/*! - * Copyright (c) 2014-2019 by Contributors +/** + * Copyright 2014-2023, XGBoost Contributors * \file sparse_page_writer.h * \author Tianqi Chen */ #ifndef XGBOOST_DATA_SPARSE_PAGE_WRITER_H_ #define XGBOOST_DATA_SPARSE_PAGE_WRITER_H_ -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include // for function +#include // for string -#if DMLC_ENABLE_STD_THREAD -#include -#include -#endif // DMLC_ENABLE_STD_THREAD - -namespace xgboost { -namespace data { +#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream +#include "dmlc/io.h" // for Stream +#include "dmlc/registry.h" // for Registry, FunctionRegEntryBase +#include "xgboost/data.h" // for SparsePage,CSCPage,SortedCSCPage,EllpackPage ... +namespace xgboost::data { template struct SparsePageFormatReg; -/*! - * \brief Format specification of SparsePage. +/** + * @brief Format specification of various data formats like SparsePage. */ -template +template class SparsePageFormat { public: - /*! \brief virtual destructor */ virtual ~SparsePageFormat() = default; - /*! - * \brief Load all the segments into page, advance fi to end of the block. - * \param page The data to read page into. - * \param fi the input stream of the file - * \return true of the loading as successful, false if end of file was reached + /** + * @brief Load all the segments into page, advance fi to end of the block. + * + * @param page The data to read page into. + * @param fi the input stream of the file + * @return true of the loading as successful, false if end of file was reached */ - virtual bool Read(T* page, dmlc::SeekStream* fi) = 0; - /*! - * \brief save the data to fo, when a page was written. 
- * \param fo output stream + virtual bool Read(T* page, common::AlignedResourceReadStream* fi) = 0; + /** + * @brief save the data to fo, when a page was written. + * + * @param fo output stream */ - virtual size_t Write(const T& page, dmlc::Stream* fo) = 0; + virtual size_t Write(const T& page, common::AlignedFileWriteStream* fo) = 0; }; /*! @@ -105,6 +97,5 @@ struct SparsePageFormatReg DMLC_REGISTRY_REGISTER(SparsePageFormatReg, \ GHistIndexPageFmt, Name) -} // namespace data -} // namespace xgboost +} // namespace xgboost::data #endif // XGBOOST_DATA_SPARSE_PAGE_WRITER_H_ diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index f67c05344..9d268b8d7 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -634,6 +634,22 @@ GBTree::GetPredictor(HostDeviceVector const *out_pred, return cpu_predictor_; } + // Data comes from SparsePageDMatrix. Since we are loading data in pages, no need to + // prevent data copy. + if (f_dmat && !f_dmat->SingleColBlock()) { + if (ctx_->IsCPU()) { + return cpu_predictor_; + } else { +#if defined(XGBOOST_USE_CUDA) + CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost."; + return gpu_predictor_; +#else + common::AssertGPUSupport(); + return cpu_predictor_; +#endif // defined(XGBOOST_USE_CUDA) + } + } + // Data comes from Device DMatrix. auto is_ellpack = f_dmat && f_dmat->PageExists() && !f_dmat->PageExists(); diff --git a/tests/cpp/common/test_io.cc b/tests/cpp/common/test_io.cc index a64b60b80..986e58c5a 100644 --- a/tests/cpp/common/test_io.cc +++ b/tests/cpp/common/test_io.cc @@ -3,11 +3,12 @@ */ #include -#include +#include // for size_t +#include // for ofstream #include "../../../src/common/io.h" -#include "../helpers.h" #include "../filesystem.h" // dmlc::TemporaryDirectory +#include "../helpers.h" namespace xgboost::common { TEST(MemoryFixSizeBuffer, Seek) { @@ -89,6 +90,57 @@ TEST(IO, LoadSequentialFile) { ASSERT_THROW(LoadSequentialFile("non-exist", true), dmlc::Error); } +TEST(IO, Resource) { + { + // test malloc basic + std::size_t n = 128; + std::shared_ptr resource = std::make_shared(n); + ASSERT_EQ(resource->Size(), n); + ASSERT_EQ(resource->Type(), ResourceHandler::kMalloc); + } + + // test malloc resize + auto test_malloc_resize = [](bool force_malloc) { + std::size_t n = 64; + std::shared_ptr resource = std::make_shared(n); + auto ptr = reinterpret_cast(resource->Data()); + std::iota(ptr, ptr + n, 0); + + auto malloc_resource = std::dynamic_pointer_cast(resource); + ASSERT_TRUE(malloc_resource); + if (force_malloc) { + malloc_resource->Resize(n * 2); + } else { + malloc_resource->Resize(n * 2); + } + for (std::size_t i = 0; i < n; ++i) { + ASSERT_EQ(malloc_resource->DataAs()[i], i) << force_malloc; + } + for (std::size_t i = n; i < 2 * n; ++i) { + ASSERT_EQ(malloc_resource->DataAs()[i], 0); + } + }; + test_malloc_resize(true); + test_malloc_resize(false); + + { + // test mmap + dmlc::TemporaryDirectory tmpdir; + auto path = tmpdir.path + "/testfile"; + + std::ofstream fout(path, std::ios::binary); + double val{1.0}; + fout.write(reinterpret_cast(&val), sizeof(val)); + fout << 1.0 << std::endl; + fout.close(); + + auto resource = std::make_shared(path, 0, sizeof(double)); + ASSERT_EQ(resource->Size(), sizeof(double)); + ASSERT_EQ(resource->Type(), ResourceHandler::kMmap); + ASSERT_EQ(resource->DataAs()[0], val); + } +} + TEST(IO, PrivateMmapStream) { dmlc::TemporaryDirectory tempdir; auto path = tempdir.path + "/testfile"; @@ -124,17 +176,35 @@ TEST(IO, PrivateMmapStream) { // Turn size info offset 
std::partial_sum(offset.begin(), offset.end(), offset.begin()); + // Test read for (std::size_t i = 0; i < n_batches; ++i) { std::size_t off = offset[i]; std::size_t n = offset.at(i + 1) - offset[i]; - std::unique_ptr fi{std::make_unique(path, off, n)}; + auto fi{std::make_unique(path, off, n)}; std::vector data; std::uint64_t size{0}; - fi->Read(&size); + ASSERT_TRUE(fi->Read(&size)); + ASSERT_EQ(fi->Tell(), sizeof(size)); data.resize(size); - fi->Read(data.data(), size * sizeof(T)); + ASSERT_EQ(fi->Read(data.data(), size * sizeof(T)), size * sizeof(T)); + ASSERT_EQ(data, batches[i]); + } + + // Test consume + for (std::size_t i = 0; i < n_batches; ++i) { + std::size_t off = offset[i]; + std::size_t n = offset.at(i + 1) - offset[i]; + std::unique_ptr fi{std::make_unique(path, off, n)}; + std::vector data; + + std::uint64_t size{0}; + ASSERT_TRUE(fi->Consume(&size)); + ASSERT_EQ(fi->Tell(), sizeof(size)); + data.resize(size); + + ASSERT_EQ(fi->Read(data.data(), size * sizeof(T)), sizeof(T) * size); ASSERT_EQ(data, batches[i]); } } diff --git a/tests/cpp/common/test_ref_resource_view.cc b/tests/cpp/common/test_ref_resource_view.cc new file mode 100644 index 000000000..9ae55fdec --- /dev/null +++ b/tests/cpp/common/test_ref_resource_view.cc @@ -0,0 +1,108 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include + +#include // for size_t +#include // for make_shared, make_unique +#include // for iota +#include // for vector + +#include "../../../src/common/ref_resource_view.h" +#include "dmlc/filesystem.h" // for TemporaryDirectory + +namespace xgboost::common { +TEST(RefResourceView, Basic) { + std::size_t n_bytes = 1024; + auto mem = std::make_shared(n_bytes); + { + RefResourceView view{reinterpret_cast(mem->Data()), mem->Size() / sizeof(float), mem}; + + RefResourceView kview{reinterpret_cast(mem->Data()), mem->Size() / sizeof(float), + mem}; + ASSERT_EQ(mem.use_count(), 3); + ASSERT_EQ(view.size(), n_bytes / sizeof(1024)); + ASSERT_EQ(kview.size(), n_bytes / sizeof(1024)); + } + { + RefResourceView view{reinterpret_cast(mem->Data()), mem->Size() / sizeof(float), mem, + 1.5f}; + for (auto v : view) { + ASSERT_EQ(v, 1.5f); + } + std::iota(view.begin(), view.end(), 0.0f); + ASSERT_EQ(view.front(), 0.0f); + ASSERT_EQ(view.back(), static_cast(view.size() - 1)); + + view.front() = 1.0f; + view.back() = 2.0f; + ASSERT_EQ(view.front(), 1.0f); + ASSERT_EQ(view.back(), 2.0f); + } + ASSERT_EQ(mem.use_count(), 1); +} + +TEST(RefResourceView, IO) { + dmlc::TemporaryDirectory tmpdir; + auto path = tmpdir.path + "/testfile"; + auto data = MakeFixedVecWithMalloc(123, std::size_t{1}); + + { + auto fo = std::make_unique(StringView{path}, "wb"); + ASSERT_EQ(fo->Write(data.data(), data.size_bytes()), data.size_bytes()); + } + { + auto fo = std::make_unique(StringView{path}, "wb"); + ASSERT_EQ(WriteVec(fo.get(), data), + data.size_bytes() + sizeof(RefResourceView::size_type)); + } + { + auto fi = std::make_unique( + path, 0, data.size_bytes() + sizeof(RefResourceView::size_type)); + auto read = MakeFixedVecWithMalloc(123, std::size_t{1}); + ASSERT_TRUE(ReadVec(fi.get(), &read)); + for (auto v : read) { + ASSERT_EQ(v, 1ul); + } + } +} + +TEST(RefResourceView, IOAligned) { + dmlc::TemporaryDirectory tmpdir; + auto path = tmpdir.path + "/testfile"; + auto data = MakeFixedVecWithMalloc(123, 1.0f); + + { + auto fo = std::make_unique(StringView{path}, "wb"); + // + sizeof(float) for alignment + ASSERT_EQ(WriteVec(fo.get(), data), + data.size_bytes() + sizeof(RefResourceView::size_type) + sizeof(float)); + } 
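+  // 123 floats occupy 492 bytes, which is not a multiple of 8; assuming IOAlignment()
+  // is 8 here, the payload is padded to 496 bytes, hence the extra sizeof(float) above.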
+ { + auto fi = std::make_unique( + path, 0, data.size_bytes() + sizeof(RefResourceView::size_type)); + // wrong type, float vs. double + auto read = MakeFixedVecWithMalloc(123, 2.0); + ASSERT_FALSE(ReadVec(fi.get(), &read)); + } + { + auto fi = std::make_unique( + path, 0, data.size_bytes() + sizeof(RefResourceView::size_type)); + auto read = MakeFixedVecWithMalloc(123, 2.0f); + ASSERT_TRUE(ReadVec(fi.get(), &read)); + for (auto v : read) { + ASSERT_EQ(v, 1ul); + } + } + { + // Test std::vector + std::vector data(123); + std::iota(data.begin(), data.end(), 0.0f); + auto fo = std::make_unique(StringView{path}, "wb"); + // + sizeof(float) for alignment + ASSERT_EQ(WriteVec(fo.get(), data), data.size() * sizeof(float) + + sizeof(RefResourceView::size_type) + + sizeof(float)); + } +} +} // namespace xgboost::common diff --git a/tests/cpp/data/test_ellpack_page_raw_format.cu b/tests/cpp/data/test_ellpack_page_raw_format.cu index 66d4024ec..f69b7b63a 100644 --- a/tests/cpp/data/test_ellpack_page_raw_format.cu +++ b/tests/cpp/data/test_ellpack_page_raw_format.cu @@ -4,14 +4,14 @@ #include #include +#include "../../../src/common/io.h" // for PrivateMmapConstStream, AlignedResourceReadStream... #include "../../../src/data/ellpack_page.cuh" #include "../../../src/data/sparse_page_source.h" #include "../../../src/tree/param.h" // TrainParam #include "../filesystem.h" // dmlc::TemporaryDirectory #include "../helpers.h" -namespace xgboost { -namespace data { +namespace xgboost::data { TEST(EllpackPageRawFormat, IO) { Context ctx{MakeCUDACtx(0)}; auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; @@ -22,15 +22,17 @@ TEST(EllpackPageRawFormat, IO) { dmlc::TemporaryDirectory tmpdir; std::string path = tmpdir.path + "/ellpack.page"; + std::size_t n_bytes{0}; { - std::unique_ptr fo{dmlc::Stream::Create(path.c_str(), "w")}; + auto fo = std::make_unique(StringView{path}, "wb"); for (auto const &ellpack : m->GetBatches(&ctx, param)) { - format->Write(ellpack, fo.get()); + n_bytes += format->Write(ellpack, fo.get()); } } EllpackPage page; - std::unique_ptr fi{dmlc::SeekStream::CreateForRead(path.c_str())}; + std::unique_ptr fi{ + std::make_unique(path.c_str(), 0, n_bytes)}; format->Read(&page, fi.get()); for (auto const &ellpack : m->GetBatches(&ctx, param)) { @@ -44,5 +46,4 @@ TEST(EllpackPageRawFormat, IO) { ASSERT_EQ(loaded->gidx_buffer.HostVector(), orig->gidx_buffer.HostVector()); } } -} // namespace data -} // namespace xgboost +} // namespace xgboost::data diff --git a/tests/cpp/data/test_gradient_index.cc b/tests/cpp/data/test_gradient_index.cc index 22eb849ee..bd29c87b0 100644 --- a/tests/cpp/data/test_gradient_index.cc +++ b/tests/cpp/data/test_gradient_index.cc @@ -26,8 +26,7 @@ #include "xgboost/context.h" // for Context #include "xgboost/host_device_vector.h" // for HostDeviceVector -namespace xgboost { -namespace data { +namespace xgboost::data { TEST(GradientIndex, ExternalMemory) { Context ctx; std::unique_ptr dmat = CreateSparsePageDMatrix(10000); @@ -171,7 +170,7 @@ class GHistIndexMatrixTest : public testing::TestWithParamGetBatches( &gpu_ctx, BatchParam{kBins, tree::TrainParam::DftSparseThreshold()})) { - from_ellpack.reset(new GHistIndexMatrix{&ctx, Xy->Info(), page, p}); + from_ellpack = std::make_unique(&ctx, Xy->Info(), page, p); } for (auto const &from_sparse_page : Xy->GetBatches(&ctx, p)) { @@ -199,13 +198,15 @@ class GHistIndexMatrixTest : public testing::TestWithParam +#include // for Context + +#include // for size_t +#include // for unique_ptr #include 
"../../../src/common/column_matrix.h" -#include "../../../src/data/gradient_index.h" +#include "../../../src/common/io.h" // for MmapResource, AlignedResourceReadStream... +#include "../../../src/data/gradient_index.h" // for GHistIndexMatrix #include "../../../src/data/sparse_page_source.h" -#include "../helpers.h" +#include "../helpers.h" // for RandomDataGenerator -namespace xgboost { -namespace data { +namespace xgboost::data { TEST(GHistIndexPageRawFormat, IO) { Context ctx; @@ -20,15 +24,18 @@ TEST(GHistIndexPageRawFormat, IO) { std::string path = tmpdir.path + "/ghistindex.page"; auto batch = BatchParam{256, 0.5}; + std::size_t bytes{0}; { - std::unique_ptr fo{dmlc::Stream::Create(path.c_str(), "w")}; + auto fo = std::make_unique(StringView{path}, "wb"); for (auto const &index : m->GetBatches(&ctx, batch)) { - format->Write(index, fo.get()); + bytes += format->Write(index, fo.get()); } } GHistIndexMatrix page; - std::unique_ptr fi{dmlc::SeekStream::CreateForRead(path.c_str())}; + + std::unique_ptr fi{ + std::make_unique(path, 0, bytes)}; format->Read(&page, fi.get()); for (auto const &gidx : m->GetBatches(&ctx, batch)) { @@ -37,6 +44,8 @@ TEST(GHistIndexPageRawFormat, IO) { ASSERT_EQ(loaded.cut.MinValues(), page.cut.MinValues()); ASSERT_EQ(loaded.cut.Values(), page.cut.Values()); ASSERT_EQ(loaded.base_rowid, page.base_rowid); + ASSERT_EQ(loaded.row_ptr.size(), page.row_ptr.size()); + ASSERT_TRUE(std::equal(loaded.row_ptr.cbegin(), loaded.row_ptr.cend(), page.row_ptr.cbegin())); ASSERT_EQ(loaded.IsDense(), page.IsDense()); ASSERT_TRUE(std::equal(loaded.index.begin(), loaded.index.end(), page.index.begin())); ASSERT_TRUE(std::equal(loaded.index.Offset(), loaded.index.Offset() + loaded.index.OffsetSize(), @@ -45,5 +54,4 @@ TEST(GHistIndexPageRawFormat, IO) { ASSERT_EQ(loaded.Transpose().GetTypeSize(), loaded.Transpose().GetTypeSize()); } } -} // namespace data -} // namespace xgboost +} // namespace xgboost::data diff --git a/tests/cpp/data/test_sparse_page_raw_format.cc b/tests/cpp/data/test_sparse_page_raw_format.cc index 722655880..bd0f97dcc 100644 --- a/tests/cpp/data/test_sparse_page_raw_format.cc +++ b/tests/cpp/data/test_sparse_page_raw_format.cc @@ -2,20 +2,20 @@ * Copyright 2021-2023, XGBoost contributors */ #include -#include // for CSCPage, SortedCSCPage, SparsePage +#include // for CSCPage, SortedCSCPage, SparsePage -#include // for allocator, unique_ptr, __shared_ptr_ac... -#include // for char_traits, operator+, basic_string +#include // for allocator, unique_ptr, __shared_ptr_ac... +#include // for char_traits, operator+, basic_string +#include "../../../src/common/io.h" // for PrivateMmapConstStream, AlignedResourceReadStream... 
#include "../../../src/data/sparse_page_writer.h" // for CreatePageFormat #include "../helpers.h" // for RandomDataGenerator #include "dmlc/filesystem.h" // for TemporaryDirectory -#include "dmlc/io.h" // for SeekStream, Stream +#include "dmlc/io.h" // for Stream #include "gtest/gtest_pred_impl.h" // for Test, AssertionResult, ASSERT_EQ, TEST #include "xgboost/context.h" // for Context -namespace xgboost { -namespace data { +namespace xgboost::data { template void TestSparsePageRawFormat() { std::unique_ptr> format{CreatePageFormat("raw")}; Context ctx; @@ -25,17 +25,19 @@ template void TestSparsePageRawFormat() { dmlc::TemporaryDirectory tmpdir; std::string path = tmpdir.path + "/sparse.page"; S orig; + std::size_t n_bytes{0}; { // block code to flush the stream - std::unique_ptr fo{dmlc::Stream::Create(path.c_str(), "w")}; + auto fo = std::make_unique(StringView{path}, "wb"); for (auto const &page : m->GetBatches(&ctx)) { orig.Push(page); - format->Write(page, fo.get()); + n_bytes = format->Write(page, fo.get()); } } S page; - std::unique_ptr fi{dmlc::SeekStream::CreateForRead(path.c_str())}; + std::unique_ptr fi{ + std::make_unique(path.c_str(), 0, n_bytes)}; format->Read(&page, fi.get()); for (size_t i = 0; i < orig.data.Size(); ++i) { ASSERT_EQ(page.data.HostVector()[i].fvalue, @@ -59,5 +61,4 @@ TEST(SparsePageRawFormat, CSCPage) { TEST(SparsePageRawFormat, SortedCSCPage) { TestSparsePageRawFormat(); } -} // namespace data -} // namespace xgboost +} // namespace xgboost::data From f4798718c7729ed351471231bb13e0d4862551af Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 27 Jun 2023 23:04:24 +0800 Subject: [PATCH 011/136] Use hist as the default tree method. (#9320) --- R-package/tests/testthat/test_basic.R | 43 +++++++----- R-package/tests/testthat/test_update.R | 5 +- src/gbm/gbtree.cc | 96 +++++--------------------- src/gbm/gbtree.h | 19 +---- tests/ci_build/lint_python.py | 1 + tests/cpp/test_learner.cc | 2 + tests/python/test_demos.py | 5 +- tests/python/test_shap.py | 73 ++++++++++---------- tests/python/test_survival.py | 48 ++++++++----- tests/python/test_with_sklearn.py | 24 ++++--- 10 files changed, 138 insertions(+), 178 deletions(-) diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 94cd1ded3..a21b03d77 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -85,9 +85,18 @@ test_that("dart prediction works", { rnorm(100) set.seed(1994) - booster_by_xgboost <- xgboost(data = d, label = y, max_depth = 2, booster = "dart", - rate_drop = 0.5, one_drop = TRUE, - eta = 1, nthread = 2, nrounds = nrounds, objective = "reg:squarederror") + booster_by_xgboost <- xgboost( + data = d, + label = y, + max_depth = 2, + booster = "dart", + rate_drop = 0.5, + one_drop = TRUE, + eta = 1, + nthread = 2, + nrounds = nrounds, + objective = "reg:squarederror" + ) pred_by_xgboost_0 <- predict(booster_by_xgboost, newdata = d, ntreelimit = 0) pred_by_xgboost_1 <- predict(booster_by_xgboost, newdata = d, ntreelimit = nrounds) expect_true(all(matrix(pred_by_xgboost_0, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE))) @@ -97,19 +106,19 @@ test_that("dart prediction works", { set.seed(1994) dtrain <- xgb.DMatrix(data = d, info = list(label = y)) - booster_by_train <- xgb.train(params = list( - booster = "dart", - max_depth = 2, - eta = 1, - rate_drop = 0.5, - one_drop = TRUE, - nthread = 1, - tree_method = "exact", - objective = "reg:squarederror" - ), - data = dtrain, - nrounds = nrounds - ) + 
booster_by_train <- xgb.train( + params = list( + booster = "dart", + max_depth = 2, + eta = 1, + rate_drop = 0.5, + one_drop = TRUE, + nthread = 1, + objective = "reg:squarederror" + ), + data = dtrain, + nrounds = nrounds + ) pred_by_train_0 <- predict(booster_by_train, newdata = dtrain, ntreelimit = 0) pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, ntreelimit = nrounds) pred_by_train_2 <- predict(booster_by_train, newdata = dtrain, training = TRUE) @@ -399,7 +408,7 @@ test_that("colsample_bytree works", { xgb.importance(model = bst) # If colsample_bytree works properly, a variety of features should be used # in the 100 trees - expect_gte(nrow(xgb.importance(model = bst)), 30) + expect_gte(nrow(xgb.importance(model = bst)), 28) }) test_that("Configuration works", { diff --git a/R-package/tests/testthat/test_update.R b/R-package/tests/testthat/test_update.R index 887ffeb06..c961bab1a 100644 --- a/R-package/tests/testthat/test_update.R +++ b/R-package/tests/testthat/test_update.R @@ -13,7 +13,10 @@ test_that("updating the model works", { watchlist <- list(train = dtrain, test = dtest) # no-subsampling - p1 <- list(objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2) + p1 <- list( + objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2, + updater = "grow_colmaker,prune" + ) set.seed(11) bst1 <- xgb.train(p1, dtrain, nrounds = 10, watchlist, verbose = 0) tr1 <- xgb.model.dt.tree(model = bst1) diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 9d268b8d7..a4f91abe3 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -39,7 +39,6 @@ namespace xgboost::gbm { DMLC_REGISTRY_FILE_TAG(gbtree); void GBTree::Configure(Args const& cfg) { - this->cfg_ = cfg; std::string updater_seq = tparam_.updater_seq; tparam_.UpdateAllowUnknown(cfg); tree_param_.UpdateAllowUnknown(cfg); @@ -78,10 +77,9 @@ void GBTree::Configure(Args const& cfg) { monitor_.Init("GBTree"); - specified_updater_ = std::any_of(cfg.cbegin(), cfg.cend(), - [](std::pair const& arg) { - return arg.first == "updater"; - }); + specified_updater_ = std::any_of( + cfg.cbegin(), cfg.cend(), + [](std::pair const& arg) { return arg.first == "updater"; }); if (specified_updater_ && !showed_updater_warning_) { LOG(WARNING) << "DANGER AHEAD: You have manually specified `updater` " @@ -93,12 +91,19 @@ void GBTree::Configure(Args const& cfg) { showed_updater_warning_ = true; } + if (model_.learner_model_param->IsVectorLeaf()) { + CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto) + << "Only the hist tree method is supported for building multi-target trees with vector " + "leaf."; + } + LOG(DEBUG) << "Using tree method: " << static_cast(tparam_.tree_method); this->ConfigureUpdaters(); + if (updater_seq != tparam_.updater_seq) { updaters_.clear(); this->InitUpdater(cfg); } else { - for (auto &up : updaters_) { + for (auto& up : updaters_) { up->Configure(cfg); } } @@ -106,66 +111,6 @@ void GBTree::Configure(Args const& cfg) { configured_ = true; } -// FIXME(trivialfis): This handles updaters. Because the choice of updaters depends on -// whether external memory is used and how large is dataset. We can remove the dependency -// on DMatrix once `hist` tree method can handle external memory so that we can make it -// default. 
-void GBTree::ConfigureWithKnownData(Args const& cfg, DMatrix* fmat) { - CHECK(this->configured_); - std::string updater_seq = tparam_.updater_seq; - CHECK(tparam_.GetInitialised()); - - tparam_.UpdateAllowUnknown(cfg); - - this->PerformTreeMethodHeuristic(fmat); - this->ConfigureUpdaters(); - - // initialize the updaters only when needed. - if (updater_seq != tparam_.updater_seq) { - LOG(DEBUG) << "Using updaters: " << tparam_.updater_seq; - this->updaters_.clear(); - this->InitUpdater(cfg); - } -} - -void GBTree::PerformTreeMethodHeuristic(DMatrix* fmat) { - if (specified_updater_) { - // This method is disabled when `updater` parameter is explicitly - // set, since only experts are expected to do so. - return; - } - if (model_.learner_model_param->IsVectorLeaf()) { - CHECK(tparam_.tree_method == TreeMethod::kHist) - << "Only the hist tree method is supported for building multi-target trees with vector " - "leaf."; - } - - // tparam_ is set before calling this function. - if (tparam_.tree_method != TreeMethod::kAuto) { - return; - } - - if (collective::IsDistributed()) { - LOG(INFO) << "Tree method is automatically selected to be 'approx' " - "for distributed training."; - tparam_.tree_method = TreeMethod::kApprox; - } else if (!fmat->SingleColBlock()) { - LOG(INFO) << "Tree method is automatically set to 'approx' " - "since external-memory data matrix is used."; - tparam_.tree_method = TreeMethod::kApprox; - } else if (fmat->Info().num_row_ >= (4UL << 20UL)) { - /* Choose tree_method='approx' automatically for large data matrix */ - LOG(INFO) << "Tree method is automatically selected to be " - "'approx' for faster speed. To use old behavior " - "(exact greedy algorithm on single machine), " - "set tree_method to 'exact'."; - tparam_.tree_method = TreeMethod::kApprox; - } else { - tparam_.tree_method = TreeMethod::kExact; - } - LOG(DEBUG) << "Using tree method: " << static_cast(tparam_.tree_method); -} - void GBTree::ConfigureUpdaters() { if (specified_updater_) { return; @@ -173,31 +118,25 @@ void GBTree::ConfigureUpdaters() { // `updater` parameter was manually specified /* Choose updaters according to tree_method parameters */ switch (tparam_.tree_method) { - case TreeMethod::kAuto: - // Use heuristic to choose between 'exact' and 'approx' This - // choice is carried out in PerformTreeMethodHeuristic() before - // calling this function. 
+ case TreeMethod::kAuto: // Use hist as default in 2.0 + case TreeMethod::kHist: { + tparam_.updater_seq = "grow_quantile_histmaker"; break; + } case TreeMethod::kApprox: tparam_.updater_seq = "grow_histmaker"; break; case TreeMethod::kExact: tparam_.updater_seq = "grow_colmaker,prune"; break; - case TreeMethod::kHist: { - LOG(INFO) << "Tree method is selected to be 'hist', which uses a single updater " - "grow_quantile_histmaker."; - tparam_.updater_seq = "grow_quantile_histmaker"; - break; - } case TreeMethod::kGPUHist: { common::AssertGPUSupport(); tparam_.updater_seq = "grow_gpu_hist"; break; } default: - LOG(FATAL) << "Unknown tree_method (" - << static_cast(tparam_.tree_method) << ") detected"; + LOG(FATAL) << "Unknown tree_method (" << static_cast(tparam_.tree_method) + << ") detected"; } } @@ -253,7 +192,6 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector* in_gpair, PredictionCacheEntry* predt, ObjFunction const* obj) { TreesOneIter new_trees; bst_target_t const n_groups = model_.learner_model_param->OutputLength(); - ConfigureWithKnownData(this->cfg_, p_fmat); monitor_.Start("BoostNewTrees"); // Weird case that tree method is cpu-based but gpu_id is set. Ideally we should let diff --git a/src/gbm/gbtree.h b/src/gbm/gbtree.h index 6e7da77ac..dc5fc975d 100644 --- a/src/gbm/gbtree.h +++ b/src/gbm/gbtree.h @@ -56,9 +56,7 @@ DECLARE_FIELD_ENUM_CLASS(xgboost::TreeMethod); DECLARE_FIELD_ENUM_CLASS(xgboost::TreeProcessType); DECLARE_FIELD_ENUM_CLASS(xgboost::PredictorType); -namespace xgboost { -namespace gbm { - +namespace xgboost::gbm { /*! \brief training parameters */ struct GBTreeTrainParam : public XGBoostParameter { /*! \brief tree updater sequence */ @@ -192,12 +190,8 @@ class GBTree : public GradientBooster { : GradientBooster{ctx}, model_(booster_config, ctx_) {} void Configure(const Args& cfg) override; - // Revise `tree_method` and `updater` parameters after seeing the training - // data matrix, only useful when tree_method is auto. - void PerformTreeMethodHeuristic(DMatrix* fmat); /*! \brief Map `tree_method` parameter to `updater` parameter */ void ConfigureUpdaters(); - void ConfigureWithKnownData(Args const& cfg, DMatrix* fmat); /** * \brief Optionally update the leaf value. 
@@ -222,11 +216,7 @@ class GBTree : public GradientBooster { return tparam_; } - void Load(dmlc::Stream* fi) override { - model_.Load(fi); - this->cfg_.clear(); - } - + void Load(dmlc::Stream* fi) override { model_.Load(fi); } void Save(dmlc::Stream* fo) const override { model_.Save(fo); } @@ -416,8 +406,6 @@ class GBTree : public GradientBooster { bool showed_updater_warning_ {false}; bool specified_updater_ {false}; bool configured_ {false}; - // configurations for tree - Args cfg_; // the updaters that can be applied to each of tree std::vector> updaters_; // Predictors @@ -431,7 +419,6 @@ class GBTree : public GradientBooster { common::Monitor monitor_; }; -} // namespace gbm -} // namespace xgboost +} // namespace xgboost::gbm #endif // XGBOOST_GBM_GBTREE_H_ diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py index 85ece676e..90a7781c2 100644 --- a/tests/ci_build/lint_python.py +++ b/tests/ci_build/lint_python.py @@ -23,6 +23,7 @@ class LintersPaths: "tests/python/test_predict.py", "tests/python/test_quantile_dmatrix.py", "tests/python/test_tree_regularization.py", + "tests/python/test_shap.py", "tests/python-gpu/test_gpu_data_iterator.py", "tests/test_distributed/test_with_spark/", "tests/test_distributed/test_gpu_with_spark/", diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc index a3bb30fcd..5c561d2a4 100644 --- a/tests/cpp/test_learner.cc +++ b/tests/cpp/test_learner.cc @@ -379,6 +379,8 @@ TEST(Learner, Seed) { TEST(Learner, ConstantSeed) { auto m = RandomDataGenerator{10, 10, 0}.GenerateDMatrix(true); std::unique_ptr learner{Learner::Create({m})}; + // Use exact as it doesn't initialize column sampler at construction, which alters the rng. + learner->SetParam("tree_method", "exact"); learner->Configure(); // seed the global random std::uniform_real_distribution dist; diff --git a/tests/python/test_demos.py b/tests/python/test_demos.py index c54f35046..58cfb0d4c 100644 --- a/tests/python/test_demos.py +++ b/tests/python/test_demos.py @@ -18,9 +18,8 @@ CLI_DEMO_DIR = os.path.join(DEMO_DIR, 'CLI') def test_basic_walkthrough(): script = os.path.join(PYTHON_DEMO_DIR, 'basic_walkthrough.py') cmd = ['python', script] - subprocess.check_call(cmd) - os.remove('dump.nice.txt') - os.remove('dump.raw.txt') + with tempfile.TemporaryDirectory() as tmpdir: + subprocess.check_call(cmd, cwd=tmpdir) @pytest.mark.skipif(**tm.no_matplotlib()) diff --git a/tests/python/test_shap.py b/tests/python/test_shap.py index 2585da088..bbbdcedc0 100644 --- a/tests/python/test_shap.py +++ b/tests/python/test_shap.py @@ -6,35 +6,34 @@ import scipy import scipy.special import xgboost as xgb - -dpath = 'demo/data/' -rng = np.random.RandomState(1994) +from xgboost import testing as tm class TestSHAP: - - def test_feature_importances(self): - data = np.random.randn(100, 5) + def test_feature_importances(self) -> None: + rng = np.random.RandomState(1994) + data = rng.randn(100, 5) target = np.array([0, 1] * 50) - features = ['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5'] + features = ["Feature1", "Feature2", "Feature3", "Feature4", "Feature5"] - dm = xgb.DMatrix(data, label=target, - feature_names=features) - params = {'objective': 'multi:softprob', - 'eval_metric': 'mlogloss', - 'eta': 0.3, - 'num_class': 3} + dm = xgb.DMatrix(data, label=target, feature_names=features) + params = { + "objective": "multi:softprob", + "eval_metric": "mlogloss", + "eta": 0.3, + "num_class": 3, + } bst = xgb.train(params, dm, num_boost_round=10) # number of feature importances should 
== number of features scores1 = bst.get_score() - scores2 = bst.get_score(importance_type='weight') - scores3 = bst.get_score(importance_type='cover') - scores4 = bst.get_score(importance_type='gain') - scores5 = bst.get_score(importance_type='total_cover') - scores6 = bst.get_score(importance_type='total_gain') + scores2 = bst.get_score(importance_type="weight") + scores3 = bst.get_score(importance_type="cover") + scores4 = bst.get_score(importance_type="gain") + scores5 = bst.get_score(importance_type="total_cover") + scores6 = bst.get_score(importance_type="total_gain") assert len(scores1) == len(features) assert len(scores2) == len(features) assert len(scores3) == len(features) @@ -46,12 +45,11 @@ class TestSHAP: fscores = bst.get_fscore() assert scores1 == fscores - dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train?format=libsvm') - dtest = xgb.DMatrix(dpath + 'agaricus.txt.test?format=libsvm') + dtrain, dtest = tm.load_agaricus(__file__) - def fn(max_depth, num_rounds): + def fn(max_depth: int, num_rounds: int) -> None: # train - params = {'max_depth': max_depth, 'eta': 1, 'verbosity': 0} + params = {"max_depth": max_depth, "eta": 1, "verbosity": 0} bst = xgb.train(params, dtrain, num_boost_round=num_rounds) # predict @@ -82,7 +80,7 @@ class TestSHAP: assert out[0, 1] == 0.375 assert out[0, 2] == 0.25 - def parse_model(model): + def parse_model(model: xgb.Booster) -> list: trees = [] r_exp = r"([0-9]+):\[f([0-9]+)<([0-9\.e-]+)\] yes=([0-9]+),no=([0-9]+).*cover=([0-9e\.]+)" r_exp_leaf = r"([0-9]+):leaf=([0-9\.e-]+),cover=([0-9e\.]+)" @@ -93,7 +91,9 @@ class TestSHAP: match = re.search(r_exp, line) if match is not None: ind = int(match.group(1)) + assert trees[-1] is not None while ind >= len(trees[-1]): + assert isinstance(trees[-1], list) trees[-1].append(None) trees[-1][ind] = { "yes_ind": int(match.group(4)), @@ -101,17 +101,16 @@ class TestSHAP: "value": None, "threshold": float(match.group(3)), "feature_index": int(match.group(2)), - "cover": float(match.group(6)) + "cover": float(match.group(6)), } else: - match = re.search(r_exp_leaf, line) ind = int(match.group(1)) while ind >= len(trees[-1]): trees[-1].append(None) trees[-1][ind] = { "value": float(match.group(2)), - "cover": float(match.group(3)) + "cover": float(match.group(3)), } return trees @@ -121,7 +120,8 @@ class TestSHAP: else: ind = tree[i]["feature_index"] if z[ind] == 1: - if x[ind] < tree[i]["threshold"]: + # 1e-6 for numeric error from parsing text dump. 
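The 1e-6 slack introduced here exists because `parse_model` reads split thresholds back out of a text dump, a decimal rendering of the booster's float32 split conditions, while the engine itself compares `fvalue < SplitCond()` in float32. An exact `<` against the parsed value can therefore route a boundary point down a different branch than the real traversal. A minimal NumPy sketch of the mismatch (the tolerance mirrors the test's; the concrete values are illustrative):

.. code-block:: python

    import numpy as np

    thr = np.float32(0.2)   # split condition as the booster stores it
    x = 0.2                 # feature value sitting exactly on the boundary

    # The engine compares in float32: 0.2f < 0.2f is False -> "no" branch.
    engine = np.float32(x) < thr
    # A float64 comparison against the widened threshold flips the decision.
    naive = x < float(thr)
    # The slack absorbs the round-trip error and agrees with the engine.
    tolerant = x + 1e-6 <= float(thr)
    assert engine == tolerant and engine != naive
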
+ if x[ind] + 1e-6 <= tree[i]["threshold"]: return exp_value_rec(tree, z, x, tree[i]["yes_ind"]) else: return exp_value_rec(tree, z, x, tree[i]["no_ind"]) @@ -136,10 +136,13 @@ class TestSHAP: return val def exp_value(trees, z, x): + "E[f(z)|Z_s = X_s]" return np.sum([exp_value_rec(tree, z, x) for tree in trees]) def all_subsets(ss): - return itertools.chain(*map(lambda x: itertools.combinations(ss, x), range(0, len(ss) + 1))) + return itertools.chain( + *map(lambda x: itertools.combinations(ss, x), range(0, len(ss) + 1)) + ) def shap_value(trees, x, i, cond=None, cond_value=None): M = len(x) @@ -196,7 +199,9 @@ class TestSHAP: z[i] = 0 v01 = exp_value(trees, z, x) z[j] = 0 - total += (v11 - v01 - v10 + v00) / (scipy.special.binom(M - 2, len(subset)) * (M - 1)) + total += (v11 - v01 - v10 + v00) / ( + scipy.special.binom(M - 2, len(subset)) * (M - 1) + ) z[list(subset)] = 0 return total @@ -220,11 +225,10 @@ class TestSHAP: assert np.linalg.norm(brute_force - fast_method[0, :, :]) < 1e-4 # test a random function - np.random.seed(0) M = 2 N = 4 - X = np.random.randn(N, M) - y = np.random.randn(N) + X = rng.randn(N, M) + y = rng.randn(N) param = {"max_depth": 2, "base_score": 0.0, "eta": 1.0, "lambda": 0} bst = xgb.train(param, xgb.DMatrix(X, label=y), 1) brute_force = shap_values(parse_model(bst), X[0, :]) @@ -236,11 +240,10 @@ class TestSHAP: assert np.linalg.norm(brute_force - fast_method[0, :, :]) < 1e-4 # test another larger more complex random function - np.random.seed(0) M = 5 N = 100 - X = np.random.randn(N, M) - y = np.random.randn(N) + X = rng.randn(N, M) + y = rng.randn(N) base_score = 1.0 param = {"max_depth": 5, "base_score": base_score, "eta": 0.1, "gamma": 2.0} bst = xgb.train(param, xgb.DMatrix(X, label=y), 10) diff --git a/tests/python/test_survival.py b/tests/python/test_survival.py index 35de79ce6..e5ca30fff 100644 --- a/tests/python/test_survival.py +++ b/tests/python/test_survival.py @@ -1,6 +1,6 @@ import json import os -from typing import Optional, Tuple +from typing import List, Optional, Tuple, cast import numpy as np import pytest @@ -62,8 +62,8 @@ def test_aft_survival_toy_data( X = np.array([1, 2, 3, 4, 5]).reshape((-1, 1)) dmat, y_lower, y_upper = toy_data - # "Accuracy" = the number of data points whose ranged label (y_lower, y_upper) includes - # the corresponding predicted label (y_pred) + # "Accuracy" = the number of data points whose ranged label (y_lower, y_upper) + # includes the corresponding predicted label (y_pred) acc_rec = [] class Callback(xgb.callback.TrainingCallback): @@ -71,21 +71,33 @@ def test_aft_survival_toy_data( super().__init__() def after_iteration( - self, model: xgb.Booster, + self, + model: xgb.Booster, epoch: int, - evals_log: xgb.callback.TrainingCallback.EvalsLog + evals_log: xgb.callback.TrainingCallback.EvalsLog, ): y_pred = model.predict(dmat) - acc = np.sum(np.logical_and(y_pred >= y_lower, y_pred <= y_upper)/len(X)) + acc = np.sum(np.logical_and(y_pred >= y_lower, y_pred <= y_upper) / len(X)) acc_rec.append(acc) return False - evals_result = {} - params = {'max_depth': 3, 'objective': 'survival:aft', 'min_child_weight': 0} - bst = xgb.train(params, dmat, 15, [(dmat, 'train')], evals_result=evals_result, - callbacks=[Callback()]) + evals_result: xgb.callback.TrainingCallback.EvalsLog = {} + params = { + "max_depth": 3, + "objective": "survival:aft", + "min_child_weight": 0, + "tree_method": "exact", + } + bst = xgb.train( + params, + dmat, + 15, + [(dmat, "train")], + evals_result=evals_result, + callbacks=[Callback()], + ) - 
nloglik_rec = evals_result['train']['aft-nloglik'] + nloglik_rec = cast(List[float], evals_result["train"]["aft-nloglik"]) # AFT metric (negative log likelihood) improve monotonically assert all(p >= q for p, q in zip(nloglik_rec, nloglik_rec[:1])) # "Accuracy" improve monotonically. @@ -94,15 +106,17 @@ def test_aft_survival_toy_data( assert acc_rec[-1] == 1.0 def gather_split_thresholds(tree): - if 'split_condition' in tree: - return (gather_split_thresholds(tree['children'][0]) - | gather_split_thresholds(tree['children'][1]) - | {tree['split_condition']}) + if "split_condition" in tree: + return ( + gather_split_thresholds(tree["children"][0]) + | gather_split_thresholds(tree["children"][1]) + | {tree["split_condition"]} + ) return set() # Only 2.5, 3.5, and 4.5 are used as split thresholds. - model_json = [json.loads(e) for e in bst.get_dump(dump_format='json')] - for tree in model_json: + model_json = [json.loads(e) for e in bst.get_dump(dump_format="json")] + for i, tree in enumerate(model_json): assert gather_split_thresholds(tree).issubset({2.5, 3.5, 4.5}) diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index d1915267b..26d18493c 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -475,18 +475,22 @@ def test_rf_regression(): run_housing_rf_regression("hist") -def test_parameter_tuning(): +@pytest.mark.parametrize("tree_method", ["exact", "hist", "approx"]) +def test_parameter_tuning(tree_method: str) -> None: from sklearn.datasets import fetch_california_housing from sklearn.model_selection import GridSearchCV X, y = fetch_california_housing(return_X_y=True) - xgb_model = xgb.XGBRegressor(learning_rate=0.1) - clf = GridSearchCV(xgb_model, {'max_depth': [2, 4], - 'n_estimators': [50, 200]}, - cv=2, verbose=1) - clf.fit(X, y) - assert clf.best_score_ < 0.7 - assert clf.best_params_ == {'n_estimators': 200, 'max_depth': 4} + reg = xgb.XGBRegressor(learning_rate=0.1, tree_method=tree_method) + grid_cv = GridSearchCV( + reg, {"max_depth": [2, 4], "n_estimators": [50, 200]}, cv=2, verbose=1 + ) + grid_cv.fit(X, y) + assert grid_cv.best_score_ < 0.7 + assert grid_cv.best_params_ == { + "n_estimators": 200, + "max_depth": 4 if tree_method == "exact" else 2, + } def test_regression_with_custom_objective(): @@ -750,7 +754,7 @@ def test_parameters_access(): ]["tree_method"] return tm - assert get_tm(clf) == "exact" + assert get_tm(clf) == "auto" # Kept as auto, immutable since 2.0 clf = pickle.loads(pickle.dumps(clf)) @@ -758,7 +762,7 @@ def test_parameters_access(): assert clf.n_estimators == 2 assert clf.get_params()["tree_method"] is None assert clf.get_params()["n_estimators"] == 2 - assert get_tm(clf) == "exact" # preserved for pickle + assert get_tm(clf) == "auto" # preserved for pickle clf = save_load(clf) From f90771eec64cea7b1cb4e41a31000c8c54fcc6d8 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Wed, 28 Jun 2023 19:34:30 -0700 Subject: [PATCH 012/136] Fix device communicator dependency (#9346) --- src/collective/communicator.cu | 6 +- .../device_communicator_adapter.cuh | 33 +++--- src/collective/nccl_device_communicator.cu | 52 ++++----- src/collective/nccl_device_communicator.cuh | 10 +- .../test_nccl_device_communicator.cu | 7 +- tests/cpp/plugin/helpers.h | 9 +- tests/cpp/plugin/test_federated_adapter.cu | 103 +++++++++--------- .../cpp/plugin/test_federated_communicator.cc | 4 +- tests/cpp/plugin/test_federated_data.cc | 2 +- tests/cpp/plugin/test_federated_server.cc | 4 +- 10 files changed, 107 
insertions(+), 123 deletions(-) diff --git a/src/collective/communicator.cu b/src/collective/communicator.cu index 8bd10382d..8cdb7f2fd 100644 --- a/src/collective/communicator.cu +++ b/src/collective/communicator.cu @@ -30,12 +30,12 @@ DeviceCommunicator* Communicator::GetDevice(int device_ordinal) { old_world_size = communicator_->GetWorldSize(); #ifdef XGBOOST_USE_NCCL if (type_ != CommunicatorType::kFederated) { - device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, Get())); + device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal)); } else { - device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal, Get())); + device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal)); } #else - device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal, Get())); + device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal)); #endif } return device_communicator_.get(); diff --git a/src/collective/device_communicator_adapter.cuh b/src/collective/device_communicator_adapter.cuh index 06637c5b4..f8135fb94 100644 --- a/src/collective/device_communicator_adapter.cuh +++ b/src/collective/device_communicator_adapter.cuh @@ -11,21 +11,18 @@ namespace collective { class DeviceCommunicatorAdapter : public DeviceCommunicator { public: - DeviceCommunicatorAdapter(int device_ordinal, Communicator *communicator) - : device_ordinal_{device_ordinal}, communicator_{communicator} { + explicit DeviceCommunicatorAdapter(int device_ordinal) + : device_ordinal_{device_ordinal}, world_size_{GetWorldSize()}, rank_{GetRank()} { if (device_ordinal_ < 0) { LOG(FATAL) << "Invalid device ordinal: " << device_ordinal_; } - if (communicator_ == nullptr) { - LOG(FATAL) << "Communicator cannot be null."; - } } ~DeviceCommunicatorAdapter() override = default; void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type, Operation op) override { - if (communicator_->GetWorldSize() == 1) { + if (world_size_ == 1) { return; } @@ -33,37 +30,34 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator { auto size = count * GetTypeSize(data_type); host_buffer_.reserve(size); dh::safe_cuda(cudaMemcpy(host_buffer_.data(), send_receive_buffer, size, cudaMemcpyDefault)); - communicator_->AllReduce(host_buffer_.data(), count, data_type, op); + Allreduce(host_buffer_.data(), count, data_type, op); dh::safe_cuda(cudaMemcpy(send_receive_buffer, host_buffer_.data(), size, cudaMemcpyDefault)); } void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector *segments, dh::caching_device_vector *receive_buffer) override { - if (communicator_->GetWorldSize() == 1) { + if (world_size_ == 1) { return; } dh::safe_cuda(cudaSetDevice(device_ordinal_)); - int const world_size = communicator_->GetWorldSize(); - int const rank = communicator_->GetRank(); segments->clear(); - segments->resize(world_size, 0); - segments->at(rank) = length_bytes; - communicator_->AllReduce(segments->data(), segments->size(), DataType::kUInt64, - Operation::kMax); + segments->resize(world_size_, 0); + segments->at(rank_) = length_bytes; + Allreduce(segments->data(), segments->size(), DataType::kUInt64, Operation::kMax); auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL); receive_buffer->resize(total_bytes); host_buffer_.reserve(total_bytes); size_t offset = 0; - for (int32_t i = 0; i < world_size; ++i) { + for (int32_t i = 0; i < world_size_; ++i) { size_t as_bytes = segments->at(i); - if (i == rank) { - 
dh::safe_cuda(cudaMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank), + if (i == rank_) { + dh::safe_cuda(cudaMemcpy(host_buffer_.data() + offset, send_buffer, segments->at(rank_), cudaMemcpyDefault)); } - communicator_->Broadcast(host_buffer_.data() + offset, as_bytes, i); + Broadcast(host_buffer_.data() + offset, as_bytes, i); offset += as_bytes; } dh::safe_cuda(cudaMemcpy(receive_buffer->data().get(), host_buffer_.data(), total_bytes, @@ -76,7 +70,8 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator { private: int const device_ordinal_; - Communicator *communicator_; + int const world_size_; + int const rank_; /// Host buffer used to call communicator functions. std::vector host_buffer_{}; }; diff --git a/src/collective/nccl_device_communicator.cu b/src/collective/nccl_device_communicator.cu index 631193db4..57419b947 100644 --- a/src/collective/nccl_device_communicator.cu +++ b/src/collective/nccl_device_communicator.cu @@ -7,31 +7,24 @@ namespace xgboost { namespace collective { -NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, Communicator *communicator) - : device_ordinal_{device_ordinal}, communicator_{communicator} { +NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal) + : device_ordinal_{device_ordinal}, world_size_{GetWorldSize()}, rank_{GetRank()} { if (device_ordinal_ < 0) { LOG(FATAL) << "Invalid device ordinal: " << device_ordinal_; } - if (communicator_ == nullptr) { - LOG(FATAL) << "Communicator cannot be null."; - } - - int32_t const rank = communicator_->GetRank(); - int32_t const world = communicator_->GetWorldSize(); - - if (world == 1) { + if (world_size_ == 1) { return; } - std::vector uuids(world * kUuidLength, 0); + std::vector uuids(world_size_ * kUuidLength, 0); auto s_uuid = xgboost::common::Span{uuids.data(), uuids.size()}; - auto s_this_uuid = s_uuid.subspan(rank * kUuidLength, kUuidLength); + auto s_this_uuid = s_uuid.subspan(rank_ * kUuidLength, kUuidLength); GetCudaUUID(s_this_uuid); // TODO(rongou): replace this with allgather. - communicator_->AllReduce(uuids.data(), uuids.size(), DataType::kUInt64, Operation::kSum); + Allreduce(uuids.data(), uuids.size(), DataType::kUInt64, Operation::kSum); - std::vector> converted(world); + std::vector> converted(world_size_); size_t j = 0; for (size_t i = 0; i < uuids.size(); i += kUuidLength) { converted[j] = xgboost::common::Span{uuids.data() + i, kUuidLength}; @@ -41,18 +34,18 @@ NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, Communicator auto iter = std::unique(converted.begin(), converted.end()); auto n_uniques = std::distance(converted.begin(), iter); - CHECK_EQ(n_uniques, world) + CHECK_EQ(n_uniques, world_size_) << "Multiple processes within communication group running on same CUDA " << "device is not supported. 
" << PrintUUID(s_this_uuid) << "\n"; nccl_unique_id_ = GetUniqueId(); dh::safe_cuda(cudaSetDevice(device_ordinal_)); - dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world, nccl_unique_id_, rank)); + dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_)); dh::safe_cuda(cudaStreamCreate(&cuda_stream_)); } NcclDeviceCommunicator::~NcclDeviceCommunicator() { - if (communicator_->GetWorldSize() == 1) { + if (world_size_ == 1) { return; } if (cuda_stream_) { @@ -139,9 +132,8 @@ void RunBitwiseAllreduce(char *out_buffer, char const *device_buffer, Func func, void NcclDeviceCommunicator::BitwiseAllReduce(void *send_receive_buffer, std::size_t count, DataType data_type, Operation op) { - auto const world_size = communicator_->GetWorldSize(); auto const size = count * GetTypeSize(data_type); - dh::caching_device_vector buffer(size * world_size); + dh::caching_device_vector buffer(size * world_size_); auto *device_buffer = buffer.data().get(); // First gather data from all the workers. @@ -152,15 +144,15 @@ void NcclDeviceCommunicator::BitwiseAllReduce(void *send_receive_buffer, std::si auto *out_buffer = static_cast(send_receive_buffer); switch (op) { case Operation::kBitwiseAND: - RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_and(), world_size, size, + RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_and(), world_size_, size, cuda_stream_); break; case Operation::kBitwiseOR: - RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_or(), world_size, size, + RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_or(), world_size_, size, cuda_stream_); break; case Operation::kBitwiseXOR: - RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_xor(), world_size, size, + RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_xor(), world_size_, size, cuda_stream_); break; default: @@ -170,7 +162,7 @@ void NcclDeviceCommunicator::BitwiseAllReduce(void *send_receive_buffer, std::si void NcclDeviceCommunicator::AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type, Operation op) { - if (communicator_->GetWorldSize() == 1) { + if (world_size_ == 1) { return; } @@ -189,24 +181,22 @@ void NcclDeviceCommunicator::AllReduce(void *send_receive_buffer, std::size_t co void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_bytes, std::vector *segments, dh::caching_device_vector *receive_buffer) { - if (communicator_->GetWorldSize() == 1) { + if (world_size_ == 1) { return; } dh::safe_cuda(cudaSetDevice(device_ordinal_)); - int const world_size = communicator_->GetWorldSize(); - int const rank = communicator_->GetRank(); segments->clear(); - segments->resize(world_size, 0); - segments->at(rank) = length_bytes; - communicator_->AllReduce(segments->data(), segments->size(), DataType::kUInt64, Operation::kMax); + segments->resize(world_size_, 0); + segments->at(rank_) = length_bytes; + Allreduce(segments->data(), segments->size(), DataType::kUInt64, Operation::kMax); auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL); receive_buffer->resize(total_bytes); size_t offset = 0; dh::safe_nccl(ncclGroupStart()); - for (int32_t i = 0; i < world_size; ++i) { + for (int32_t i = 0; i < world_size_; ++i) { size_t as_bytes = segments->at(i); dh::safe_nccl(ncclBroadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes, ncclChar, i, nccl_comm_, cuda_stream_)); @@ -216,7 +206,7 @@ void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_b } void 
NcclDeviceCommunicator::Synchronize() { - if (communicator_->GetWorldSize() == 1) { + if (world_size_ == 1) { return; } dh::safe_cuda(cudaSetDevice(device_ordinal_)); diff --git a/src/collective/nccl_device_communicator.cuh b/src/collective/nccl_device_communicator.cuh index e5f76119d..925603d21 100644 --- a/src/collective/nccl_device_communicator.cuh +++ b/src/collective/nccl_device_communicator.cuh @@ -12,7 +12,7 @@ namespace collective { class NcclDeviceCommunicator : public DeviceCommunicator { public: - NcclDeviceCommunicator(int device_ordinal, Communicator *communicator); + explicit NcclDeviceCommunicator(int device_ordinal); ~NcclDeviceCommunicator() override; void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type, Operation op) override; @@ -49,11 +49,10 @@ class NcclDeviceCommunicator : public DeviceCommunicator { ncclUniqueId GetUniqueId() { static const int kRootRank = 0; ncclUniqueId id; - if (communicator_->GetRank() == kRootRank) { + if (rank_ == kRootRank) { dh::safe_nccl(ncclGetUniqueId(&id)); } - communicator_->Broadcast(static_cast(&id), sizeof(ncclUniqueId), - static_cast(kRootRank)); + Broadcast(static_cast(&id), sizeof(ncclUniqueId), static_cast(kRootRank)); return id; } @@ -61,7 +60,8 @@ class NcclDeviceCommunicator : public DeviceCommunicator { Operation op); int const device_ordinal_; - Communicator *communicator_; + int const world_size_; + int const rank_; ncclComm_t nccl_comm_{}; cudaStream_t cuda_stream_{}; ncclUniqueId nccl_unique_id_{}; diff --git a/tests/cpp/collective/test_nccl_device_communicator.cu b/tests/cpp/collective/test_nccl_device_communicator.cu index 6ac861a55..81dd3d46d 100644 --- a/tests/cpp/collective/test_nccl_device_communicator.cu +++ b/tests/cpp/collective/test_nccl_device_communicator.cu @@ -16,12 +16,7 @@ namespace xgboost { namespace collective { TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidDeviceOrdinal) { - auto construct = []() { NcclDeviceCommunicator comm{-1, nullptr}; }; - EXPECT_THROW(construct(), dmlc::Error); -} - -TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidCommunicator) { - auto construct = []() { NcclDeviceCommunicator comm{0, nullptr}; }; + auto construct = []() { NcclDeviceCommunicator comm{-1}; }; EXPECT_THROW(construct(), dmlc::Error); } diff --git a/tests/cpp/plugin/helpers.h b/tests/cpp/plugin/helpers.h index c4d303bb5..20b4afc30 100644 --- a/tests/cpp/plugin/helpers.h +++ b/tests/cpp/plugin/helpers.h @@ -37,7 +37,14 @@ class ServerForTest { } ~ServerForTest() { + using namespace std::chrono_literals; + while (!server_) { + std::this_thread::sleep_for(100ms); + } server_->Shutdown(); + while (!server_thread_) { + std::this_thread::sleep_for(100ms); + } server_thread_->join(); } @@ -56,7 +63,7 @@ class BaseFederatedTest : public ::testing::Test { void TearDown() override { server_.reset(nullptr); } - static int constexpr kWorldSize{3}; + static int constexpr kWorldSize{2}; std::unique_ptr server_; }; diff --git a/tests/cpp/plugin/test_federated_adapter.cu b/tests/cpp/plugin/test_federated_adapter.cu index 3fb793fa7..134446f11 100644 --- a/tests/cpp/plugin/test_federated_adapter.cu +++ b/tests/cpp/plugin/test_federated_adapter.cu @@ -9,6 +9,7 @@ #include #include "../../../plugin/federated/federated_communicator.h" +#include "../../../src/collective/communicator-inl.cuh" #include "../../../src/collective/device_communicator_adapter.cuh" #include "./helpers.h" @@ -17,67 +18,63 @@ namespace xgboost::collective { class FederatedAdapterTest : public BaseFederatedTest {}; 
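The NCCL communicator implements the bitwise reductions exercised by these tests as gather-then-reduce: every rank's buffer is collected with an all-gather, then the copies are combined element-wise on each worker (`RunBitwiseAllreduce`, refactored further down this series). A host-side NumPy sketch of that local reduction step, with a stacked array standing in for the gathered buffers:

.. code-block:: python

    import numpy as np

    def local_bitwise_reduce(gathered, op=np.bitwise_and):
        """Reduce an all-gathered (world_size, n_bytes) buffer across ranks."""
        out = gathered[0].copy()
        for rank in range(1, gathered.shape[0]):
            out = op(out, gathered[rank])
        return out

    # Two ranks, three bytes each.
    gathered = np.array([[0b1100, 0b1010, 0xFF],
                         [0b1010, 0b1010, 0x0F]], dtype=np.uint8)
    assert list(local_bitwise_reduce(gathered)) == [0b1000, 0b1010, 0x0F]
    assert list(local_bitwise_reduce(gathered, np.bitwise_or)) == [0b1110, 0b1010, 0xFF]
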
TEST(FederatedAdapterSimpleTest, ThrowOnInvalidDeviceOrdinal) { - auto construct = []() { DeviceCommunicatorAdapter adapter{-1, nullptr}; }; + auto construct = []() { DeviceCommunicatorAdapter adapter{-1}; }; EXPECT_THROW(construct(), dmlc::Error); } -TEST(FederatedAdapterSimpleTest, ThrowOnInvalidCommunicator) { - auto construct = []() { DeviceCommunicatorAdapter adapter{0, nullptr}; }; - EXPECT_THROW(construct(), dmlc::Error); -} - -TEST_F(FederatedAdapterTest, DeviceAllReduceSum) { - std::vector threads; - for (auto rank = 0; rank < kWorldSize; rank++) { - threads.emplace_back([rank, server_address = server_->Address()] { - FederatedCommunicator comm{kWorldSize, rank, server_address}; - // Assign device 0 to all workers, since we run gtest in a single-GPU machine - DeviceCommunicatorAdapter adapter{0, &comm}; - int count = 3; - thrust::device_vector buffer(count, 0); - thrust::sequence(buffer.begin(), buffer.end()); - adapter.AllReduce(buffer.data().get(), count, DataType::kDouble, Operation::kSum); - thrust::host_vector host_buffer = buffer; - EXPECT_EQ(host_buffer.size(), count); - for (auto i = 0; i < count; i++) { - EXPECT_EQ(host_buffer[i], i * kWorldSize); - } - }); - } - for (auto& thread : threads) { - thread.join(); +namespace { +void VerifyAllReduceSum() { + auto const world_size = collective::GetWorldSize(); + auto const rank = collective::GetRank(); + int count = 3; + thrust::device_vector buffer(count, 0); + thrust::sequence(buffer.begin(), buffer.end()); + collective::AllReduce(rank, buffer.data().get(), count); + thrust::host_vector host_buffer = buffer; + EXPECT_EQ(host_buffer.size(), count); + for (auto i = 0; i < count; i++) { + EXPECT_EQ(host_buffer[i], i * world_size); } } +} // anonymous namespace -TEST_F(FederatedAdapterTest, DeviceAllGatherV) { - std::vector threads; - for (auto rank = 0; rank < kWorldSize; rank++) { - threads.emplace_back([rank, server_address = server_->Address()] { - FederatedCommunicator comm{kWorldSize, rank, server_address}; - // Assign device 0 to all workers, since we run gtest in a single-GPU machine - DeviceCommunicatorAdapter adapter{0, &comm}; - - int const count = rank + 2; - thrust::device_vector buffer(count, 0); - thrust::sequence(buffer.begin(), buffer.end()); - std::vector segments(kWorldSize); - dh::caching_device_vector receive_buffer{}; - - adapter.AllGatherV(buffer.data().get(), count, &segments, &receive_buffer); - - EXPECT_EQ(segments[0], 2); - EXPECT_EQ(segments[1], 3); - thrust::host_vector host_buffer = receive_buffer; - EXPECT_EQ(host_buffer.size(), 9); - int expected[] = {0, 1, 0, 1, 2, 0, 1, 2, 3}; - for (auto i = 0; i < 9; i++) { - EXPECT_EQ(host_buffer[i], expected[i]); - } - }); +TEST_F(FederatedAdapterTest, MGPUAllReduceSum) { + auto const n_gpus = common::AllVisibleGPUs(); + if (n_gpus <= 1) { + GTEST_SKIP() << "Skipping MGPUAllReduceSum test with # GPUs = " << n_gpus; } - for (auto& thread : threads) { - thread.join(); + RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyAllReduceSum); +} + +namespace { +void VerifyAllGatherV() { + auto const world_size = collective::GetWorldSize(); + auto const rank = collective::GetRank(); + int const count = rank + 2; + thrust::device_vector buffer(count, 0); + thrust::sequence(buffer.begin(), buffer.end()); + std::vector segments(world_size); + dh::caching_device_vector receive_buffer{}; + + collective::AllGatherV(rank, buffer.data().get(), count, &segments, &receive_buffer); + + EXPECT_EQ(segments[0], 2); + EXPECT_EQ(segments[1], 3); + thrust::host_vector 
host_buffer = receive_buffer; + EXPECT_EQ(host_buffer.size(), 5); + int expected[] = {0, 1, 0, 1, 2}; + for (auto i = 0; i < 5; i++) { + EXPECT_EQ(host_buffer[i], expected[i]); } } +} // anonymous namespace + +TEST_F(FederatedAdapterTest, MGPUAllGatherV) { + auto const n_gpus = common::AllVisibleGPUs(); + if (n_gpus <= 1) { + GTEST_SKIP() << "Skipping MGPUAllGatherV test with # GPUs = " << n_gpus; + } + RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyAllGatherV); +} } // namespace xgboost::collective diff --git a/tests/cpp/plugin/test_federated_communicator.cc b/tests/cpp/plugin/test_federated_communicator.cc index 62f33d5ee..8b0e1039a 100644 --- a/tests/cpp/plugin/test_federated_communicator.cc +++ b/tests/cpp/plugin/test_federated_communicator.cc @@ -31,7 +31,7 @@ class FederatedCommunicatorTest : public BaseFederatedTest { protected: static void CheckAllgather(FederatedCommunicator &comm, int rank) { - int buffer[kWorldSize] = {0, 0, 0}; + int buffer[kWorldSize] = {0, 0}; buffer[rank] = rank; comm.AllGather(buffer, sizeof(buffer)); for (auto i = 0; i < kWorldSize; i++) { @@ -42,7 +42,7 @@ class FederatedCommunicatorTest : public BaseFederatedTest { static void CheckAllreduce(FederatedCommunicator &comm) { int buffer[] = {1, 2, 3, 4, 5}; comm.AllReduce(buffer, sizeof(buffer) / sizeof(buffer[0]), DataType::kInt32, Operation::kSum); - int expected[] = {3, 6, 9, 12, 15}; + int expected[] = {2, 4, 6, 8, 10}; for (auto i = 0; i < 5; i++) { EXPECT_EQ(buffer[i], expected[i]); } diff --git a/tests/cpp/plugin/test_federated_data.cc b/tests/cpp/plugin/test_federated_data.cc index c6efb84d5..6a8233a0f 100644 --- a/tests/cpp/plugin/test_federated_data.cc +++ b/tests/cpp/plugin/test_federated_data.cc @@ -30,7 +30,7 @@ void VerifyLoadUri() { std::string uri = path + "?format=csv"; dmat.reset(DMatrix::Load(uri, false, DataSplitMode::kCol)); - ASSERT_EQ(dmat->Info().num_col_, 8 * collective::GetWorldSize() + 3); + ASSERT_EQ(dmat->Info().num_col_, 8 * collective::GetWorldSize() + 1); ASSERT_EQ(dmat->Info().num_row_, kRows); for (auto const& page : dmat->GetBatches()) { diff --git a/tests/cpp/plugin/test_federated_server.cc b/tests/cpp/plugin/test_federated_server.cc index 4dd2f3c40..633d64df1 100644 --- a/tests/cpp/plugin/test_federated_server.cc +++ b/tests/cpp/plugin/test_federated_server.cc @@ -39,7 +39,7 @@ class FederatedServerTest : public BaseFederatedTest { protected: static void CheckAllgather(federated::FederatedClient& client, int rank) { - int data[kWorldSize] = {0, 0, 0}; + int data[kWorldSize] = {0, 0}; data[rank] = rank; std::string send_buffer(reinterpret_cast(data), sizeof(data)); auto reply = client.Allgather(send_buffer); @@ -54,7 +54,7 @@ class FederatedServerTest : public BaseFederatedTest { std::string send_buffer(reinterpret_cast(data), sizeof(data)); auto reply = client.Allreduce(send_buffer, federated::INT32, federated::SUM); auto const* result = reinterpret_cast(reply.data()); - int expected[] = {3, 6, 9, 12, 15}; + int expected[] = {2, 4, 6, 8, 10}; for (auto i = 0; i < 5; i++) { EXPECT_EQ(result[i], expected[i]); } From 3a0f78770351ba93544e5796b5d0bf467bfc38bc Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Sun, 2 Jul 2023 13:05:34 -0700 Subject: [PATCH 013/136] Support column split in GPU predictor (#9343) --- src/collective/nccl_device_communicator.cu | 5 +- src/predictor/cpu_predictor.cc | 17 +- src/predictor/gpu_predictor.cu | 209 ++++++++++++++++++++- src/predictor/predict_fn.h | 20 +- tests/cpp/predictor/test_gpu_predictor.cu | 62 ++++++ 5 files changed, 
288 insertions(+), 25 deletions(-) diff --git a/src/collective/nccl_device_communicator.cu b/src/collective/nccl_device_communicator.cu index 57419b947..7f5686075 100644 --- a/src/collective/nccl_device_communicator.cu +++ b/src/collective/nccl_device_communicator.cu @@ -122,10 +122,11 @@ template void RunBitwiseAllreduce(char *out_buffer, char const *device_buffer, Func func, int world_size, std::size_t size, cudaStream_t stream) { dh::LaunchN(size, stream, [=] __device__(std::size_t idx) { - out_buffer[idx] = device_buffer[idx]; + auto result = device_buffer[idx]; for (auto rank = 1; rank < world_size; rank++) { - out_buffer[idx] = func(out_buffer[idx], device_buffer[rank * size + idx]); + result = func(result, device_buffer[rank * size + idx]); } + out_buffer[idx] = result; }); } } // anonymous namespace diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc index 26b5a85b6..56362a112 100644 --- a/src/predictor/cpu_predictor.cc +++ b/src/predictor/cpu_predictor.cc @@ -467,7 +467,6 @@ class ColumnSplitHelper { void MaskOneTree(RegTree::FVec const &feat, std::size_t tree_id, std::size_t row_id) { auto const &tree = *model_.trees[tree_id]; auto const &cats = tree.GetCategoriesMatrix(); - auto const has_categorical = tree.HasCategoricalSplit(); bst_node_t n_nodes = tree.GetNodes().size(); for (bst_node_t nid = 0; nid < n_nodes; nid++) { @@ -484,16 +483,10 @@ class ColumnSplitHelper { } auto const fvalue = feat.GetFvalue(split_index); - if (has_categorical && common::IsCat(cats.split_type, nid)) { - auto const node_categories = - cats.categories.subspan(cats.node_ptr[nid].beg, cats.node_ptr[nid].size); - if (!common::Decision(node_categories, fvalue)) { - decision_bits_.Set(bit_index); - } - continue; - } - - if (fvalue >= node.SplitCond()) { + auto const decision = tree.HasCategoricalSplit() + ? 
GetDecision(node, nid, fvalue, cats) + : GetDecision(node, nid, fvalue, cats); + if (decision) { decision_bits_.Set(bit_index); } } @@ -511,7 +504,7 @@ class ColumnSplitHelper { if (missing_bits_.Check(bit_index)) { return node.DefaultChild(); } else { - return node.LeftChild() + decision_bits_.Check(bit_index); + return node.LeftChild() + !decision_bits_.Check(bit_index); } } diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 98e380682..4ca0e33ff 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -11,9 +11,11 @@ #include // for any, any_cast #include +#include "../collective/communicator-inl.cuh" #include "../common/bitfield.h" #include "../common/categorical.h" #include "../common/common.h" +#include "../common/cuda_context.cuh" #include "../common/device_helpers.cuh" #include "../data/device_adapter.cuh" #include "../data/ellpack_page.cuh" @@ -110,13 +112,11 @@ struct SparsePageLoader { bool use_shared; SparsePageView data; float* smem; - size_t entry_start; __device__ SparsePageLoader(SparsePageView data, bool use_shared, bst_feature_t num_features, bst_row_t num_rows, size_t entry_start, float) : use_shared(use_shared), - data(data), - entry_start(entry_start) { + data(data) { extern __shared__ float _smem[]; smem = _smem; // Copy instances @@ -622,6 +622,199 @@ size_t SharedMemoryBytes(size_t cols, size_t max_shared_memory_bytes) { } return shared_memory_bytes; } + +using BitVector = LBitField64; + +__global__ void MaskBitVectorKernel( + SparsePageView data, common::Span d_nodes, + common::Span d_tree_segments, common::Span d_tree_group, + common::Span d_tree_split_types, + common::Span d_cat_tree_segments, + common::Span d_cat_node_segments, + common::Span d_categories, BitVector decision_bits, BitVector missing_bits, + std::size_t tree_begin, std::size_t tree_end, std::size_t num_features, std::size_t num_rows, + std::size_t entry_start, std::size_t num_nodes, bool use_shared, float missing) { + auto const row_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (row_idx >= num_rows) { + return; + } + SparsePageLoader loader(data, use_shared, num_features, num_rows, entry_start, missing); + + std::size_t tree_offset = 0; + for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { + TreeView d_tree{tree_begin, tree_idx, d_nodes, + d_tree_segments, d_tree_split_types, d_cat_tree_segments, + d_cat_node_segments, d_categories}; + auto const tree_nodes = d_tree.d_tree.size(); + for (auto nid = 0; nid < tree_nodes; nid++) { + auto const& node = d_tree.d_tree[nid]; + if (node.IsDeleted() || node.IsLeaf()) { + continue; + } + auto const fvalue = loader.GetElement(row_idx, node.SplitIndex()); + auto const is_missing = common::CheckNAN(fvalue); + auto const bit_index = row_idx * num_nodes + tree_offset + nid; + if (is_missing) { + missing_bits.Set(bit_index); + } else { + auto const decision = d_tree.HasCategoricalSplit() + ? 
GetDecision(node, nid, fvalue, d_tree.cats) + : GetDecision(node, nid, fvalue, d_tree.cats); + if (decision) { + decision_bits.Set(bit_index); + } + } + } + tree_offset += tree_nodes; + } +} + +__device__ float GetLeafWeightByBitVector(bst_row_t ridx, TreeView const& tree, + BitVector const& decision_bits, + BitVector const& missing_bits, std::size_t num_nodes, + std::size_t tree_offset) { + bst_node_t nidx = 0; + RegTree::Node n = tree.d_tree[nidx]; + while (!n.IsLeaf()) { + auto const bit_index = ridx * num_nodes + tree_offset + nidx; + if (missing_bits.Check(bit_index)) { + nidx = n.DefaultChild(); + } else { + nidx = n.LeftChild() + !decision_bits.Check(bit_index); + } + n = tree.d_tree[nidx]; + } + return tree.d_tree[nidx].LeafValue(); +} + +__global__ void PredictByBitVectorKernel( + common::Span d_nodes, common::Span d_out_predictions, + common::Span d_tree_segments, common::Span d_tree_group, + common::Span d_tree_split_types, + common::Span d_cat_tree_segments, + common::Span d_cat_node_segments, + common::Span d_categories, BitVector decision_bits, BitVector missing_bits, + std::size_t tree_begin, std::size_t tree_end, std::size_t num_rows, std::size_t num_nodes, + std::uint32_t num_group) { + auto const row_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (row_idx >= num_rows) { + return; + } + + std::size_t tree_offset = 0; + if (num_group == 1) { + float sum = 0; + for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { + TreeView d_tree{tree_begin, tree_idx, d_nodes, + d_tree_segments, d_tree_split_types, d_cat_tree_segments, + d_cat_node_segments, d_categories}; + sum += GetLeafWeightByBitVector(row_idx, d_tree, decision_bits, missing_bits, num_nodes, + tree_offset); + tree_offset += d_tree.d_tree.size(); + } + d_out_predictions[row_idx] += sum; + } else { + for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { + auto const tree_group = d_tree_group[tree_idx]; + TreeView d_tree{tree_begin, tree_idx, d_nodes, + d_tree_segments, d_tree_split_types, d_cat_tree_segments, + d_cat_node_segments, d_categories}; + bst_uint out_prediction_idx = row_idx * num_group + tree_group; + d_out_predictions[out_prediction_idx] += GetLeafWeightByBitVector( + row_idx, d_tree, decision_bits, missing_bits, num_nodes, tree_offset); + tree_offset += d_tree.d_tree.size(); + } + } +} + +class ColumnSplitHelper { + public: + explicit ColumnSplitHelper(Context const* ctx) : ctx_{ctx} {} + + void PredictBatch(DMatrix* dmat, HostDeviceVector* out_preds, + gbm::GBTreeModel const& model, DeviceModel const& d_model) const { + CHECK(dmat->PageExists()) << "Column split for external memory is not support."; + PredictDMatrix(dmat, out_preds, d_model, model.learner_model_param->num_feature, + model.learner_model_param->num_output_group); + } + + private: + using BitType = BitVector::value_type; + + void PredictDMatrix(DMatrix* dmat, HostDeviceVector* out_preds, DeviceModel const& model, + bst_feature_t num_features, std::uint32_t num_group) const { + dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); + dh::caching_device_vector decision_storage{}; + dh::caching_device_vector missing_storage{}; + + auto constexpr kBlockThreads = 128; + auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->gpu_id); + auto const shared_memory_bytes = + SharedMemoryBytes(num_features, max_shared_memory_bytes); + auto const use_shared = shared_memory_bytes != 0; + + auto const num_nodes = model.nodes.Size(); + std::size_t batch_offset = 0; + for (auto const& batch : dmat->GetBatches()) { + auto const 
num_rows = batch.Size(); + ResizeBitVectors(&decision_storage, &missing_storage, num_rows * num_nodes); + BitVector decision_bits{dh::ToSpan(decision_storage)}; + BitVector missing_bits{dh::ToSpan(missing_storage)}; + + batch.offset.SetDevice(ctx_->gpu_id); + batch.data.SetDevice(ctx_->gpu_id); + std::size_t entry_start = 0; + SparsePageView data(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), num_features); + + auto const grid = static_cast(common::DivRoundUp(num_rows, kBlockThreads)); + dh::LaunchKernel {grid, kBlockThreads, shared_memory_bytes, ctx_->CUDACtx()->Stream()} ( + MaskBitVectorKernel, data, model.nodes.ConstDeviceSpan(), + model.tree_segments.ConstDeviceSpan(), model.tree_group.ConstDeviceSpan(), + model.split_types.ConstDeviceSpan(), model.categories_tree_segments.ConstDeviceSpan(), + model.categories_node_segments.ConstDeviceSpan(), model.categories.ConstDeviceSpan(), + decision_bits, missing_bits, model.tree_beg_, model.tree_end_, num_features, num_rows, + entry_start, num_nodes, use_shared, nan("")); + + AllReduceBitVectors(&decision_storage, &missing_storage); + + dh::LaunchKernel {grid, kBlockThreads, 0, ctx_->CUDACtx()->Stream()} ( + PredictByBitVectorKernel, model.nodes.ConstDeviceSpan(), + out_preds->DeviceSpan().subspan(batch_offset), model.tree_segments.ConstDeviceSpan(), + model.tree_group.ConstDeviceSpan(), model.split_types.ConstDeviceSpan(), + model.categories_tree_segments.ConstDeviceSpan(), + model.categories_node_segments.ConstDeviceSpan(), model.categories.ConstDeviceSpan(), + decision_bits, missing_bits, model.tree_beg_, model.tree_end_, num_rows, num_nodes, + num_group); + + batch_offset += batch.Size() * num_group; + } + } + + void AllReduceBitVectors(dh::caching_device_vector* decision_storage, + dh::caching_device_vector* missing_storage) const { + collective::AllReduce( + ctx_->gpu_id, decision_storage->data().get(), decision_storage->size()); + collective::AllReduce( + ctx_->gpu_id, missing_storage->data().get(), missing_storage->size()); + collective::Synchronize(ctx_->gpu_id); + } + + void ResizeBitVectors(dh::caching_device_vector* decision_storage, + dh::caching_device_vector* missing_storage, + std::size_t total_bits) const { + auto const size = BitVector::ComputeStorageSize(total_bits); + if (decision_storage->size() < size) { + decision_storage->resize(size); + } + thrust::fill(ctx_->CUDACtx()->CTP(), decision_storage->begin(), decision_storage->end(), 0); + if (missing_storage->size() < size) { + missing_storage->resize(size); + } + thrust::fill(ctx_->CUDACtx()->CTP(), missing_storage->begin(), missing_storage->end(), 0); + } + + Context const* ctx_; +}; } // anonymous namespace class GPUPredictor : public xgboost::Predictor { @@ -697,6 +890,11 @@ class GPUPredictor : public xgboost::Predictor { DeviceModel d_model; d_model.Init(model, tree_begin, tree_end, ctx_->gpu_id); + if (dmat->Info().IsColumnSplit()) { + column_split_helper_.PredictBatch(dmat, out_preds, model, d_model); + return; + } + if (dmat->PageExists()) { size_t batch_offset = 0; for (auto &batch : dmat->GetBatches()) { @@ -720,7 +918,8 @@ class GPUPredictor : public xgboost::Predictor { } public: - explicit GPUPredictor(Context const* ctx) : Predictor::Predictor{ctx} {} + explicit GPUPredictor(Context const* ctx) + : Predictor::Predictor{ctx}, column_split_helper_{ctx} {} ~GPUPredictor() override { if (ctx_->gpu_id >= 0 && ctx_->gpu_id < common::AllVisibleGPUs()) { @@ -1019,6 +1218,8 @@ class GPUPredictor : public xgboost::Predictor { } return 0; } + + ColumnSplitHelper 
column_split_helper_; }; XGBOOST_REGISTER_PREDICTOR(GPUPredictor, "gpu_predictor") diff --git a/src/predictor/predict_fn.h b/src/predictor/predict_fn.h index dbaf4a75e..044832010 100644 --- a/src/predictor/predict_fn.h +++ b/src/predictor/predict_fn.h @@ -7,6 +7,18 @@ #include "xgboost/tree_model.h" namespace xgboost::predictor { +/** @brief Whether it should traverse to the left branch of a tree. */ +template +XGBOOST_DEVICE bool GetDecision(RegTree::Node const &node, bst_node_t nid, float fvalue, + RegTree::CategoricalSplitMatrix const &cats) { + if (has_categorical && common::IsCat(cats.split_type, nid)) { + auto node_categories = cats.categories.subspan(cats.node_ptr[nid].beg, cats.node_ptr[nid].size); + return common::Decision(node_categories, fvalue); + } else { + return fvalue < node.SplitCond(); + } +} + template inline XGBOOST_DEVICE bst_node_t GetNextNode(const RegTree::Node &node, const bst_node_t nid, float fvalue, bool is_missing, @@ -14,13 +26,7 @@ inline XGBOOST_DEVICE bst_node_t GetNextNode(const RegTree::Node &node, const bs if (has_missing && is_missing) { return node.DefaultChild(); } else { - if (has_categorical && common::IsCat(cats.split_type, nid)) { - auto node_categories = - cats.categories.subspan(cats.node_ptr[nid].beg, cats.node_ptr[nid].size); - return common::Decision(node_categories, fvalue) ? node.LeftChild() : node.RightChild(); - } else { - return node.LeftChild() + !(fvalue < node.SplitCond()); - } + return node.LeftChild() + !GetDecision(node, nid, fvalue, cats); } } diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu index 4cf2970c1..6911824a9 100644 --- a/tests/cpp/predictor/test_gpu_predictor.cu +++ b/tests/cpp/predictor/test_gpu_predictor.cu @@ -57,6 +57,68 @@ TEST(GPUPredictor, Basic) { } } +namespace { +void VerifyBasicColumnSplit(std::array, 32> const& expected_result) { + auto const world_size = collective::GetWorldSize(); + auto const rank = collective::GetRank(); + + auto ctx = MakeCUDACtx(rank); + std::unique_ptr predictor = + std::unique_ptr(Predictor::Create("gpu_predictor", &ctx)); + predictor->Configure({}); + + for (size_t i = 1; i < 33; i *= 2) { + size_t n_row = i, n_col = i; + auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix(); + std::unique_ptr sliced{dmat->SliceCol(world_size, rank)}; + + LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.gpu_id)}; + gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx); + + // Test predict batch + PredictionCacheEntry out_predictions; + + predictor->InitOutPredictions(sliced->Info(), &out_predictions.predictions, model); + predictor->PredictBatch(sliced.get(), &out_predictions, model, 0); + + std::vector& out_predictions_h = out_predictions.predictions.HostVector(); + EXPECT_EQ(out_predictions_h, expected_result[i - 1]); + } +} +} // anonymous namespace + +TEST(GPUPredictor, MGPUBasicColumnSplit) { + auto const n_gpus = common::AllVisibleGPUs(); + if (n_gpus <= 1) { + GTEST_SKIP() << "Skipping MGPUIBasicColumnSplit test with # GPUs = " << n_gpus; + } + + auto ctx = MakeCUDACtx(0); + std::unique_ptr predictor = + std::unique_ptr(Predictor::Create("gpu_predictor", &ctx)); + predictor->Configure({}); + + std::array, 32> result{}; + for (size_t i = 1; i < 33; i *= 2) { + size_t n_row = i, n_col = i; + auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix(); + + LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.gpu_id)}; + gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx); + + // Test predict batch + PredictionCacheEntry 
out_predictions; + + predictor->InitOutPredictions(dmat->Info(), &out_predictions.predictions, model); + predictor->PredictBatch(dmat.get(), &out_predictions, model, 0); + + std::vector& out_predictions_h = out_predictions.predictions.HostVector(); + result[i - 1] = out_predictions_h; + } + + RunWithInMemoryCommunicator(n_gpus, VerifyBasicColumnSplit, result); +} + TEST(GPUPredictor, EllpackBasic) { size_t constexpr kCols {8}; for (size_t bins = 2; bins < 258; bins += 16) { From 39390cc2ee8d8a866991c160e6d018dc005cc070 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 3 Jul 2023 19:23:54 +0800 Subject: [PATCH 014/136] [breaking] Remove the `predictor` param, allow fallback to prediction using `DMatrix`. (#9129) - A `DeviceOrd` struct is implemented to indicate the device. It will eventually replace the `gpu_id` parameter. - The `predictor` parameter is removed. - Fallback to `DMatrix` when `inplace_predict` is not available. - The heuristic for choosing a predictor is only used during training. --- doc/gpu/index.rst | 2 +- doc/parameter.rst | 12 - doc/prediction.rst | 19 -- doc/tutorials/dask.rst | 4 +- doc/tutorials/saving_model.rst | 1 - include/xgboost/base.h | 7 +- include/xgboost/c_api.h | 12 + include/xgboost/context.h | 139 ++++++++-- include/xgboost/gbm.h | 16 +- .../dmlc/xgboost4j/gpu/java/BoosterTest.java | 1 - .../scala/rapids/spark/GpuPreXGBoost.scala | 1 - python-package/xgboost/core.py | 17 +- python-package/xgboost/sklearn.py | 22 +- python-package/xgboost/testing/__init__.py | 22 ++ src/c_api/c_api.cc | 2 +- src/c_api/c_api.cu | 15 +- src/common/error_msg.h | 14 + src/data/adapter.h | 9 +- src/data/iterative_dmatrix.cu | 12 +- src/data/proxy_dmatrix.cc | 46 +++- src/data/proxy_dmatrix.cu | 25 +- src/data/proxy_dmatrix.cuh | 33 ++- src/data/proxy_dmatrix.h | 48 +++- src/data/sparse_page_source.cu | 28 +- src/gbm/gblinear.cc | 7 +- src/gbm/gbtree.cc | 189 +++++++------ src/gbm/gbtree.cu | 12 +- src/gbm/gbtree.h | 102 ++----- src/learner.cc | 7 +- src/objective/lambdarank_obj.cu | 4 +- src/predictor/cpu_predictor.cc | 5 +- src/tree/updater_gpu_hist.cu | 4 +- tests/ci_build/lint_python.py | 4 + tests/cpp/common/test_json.cc | 1 - tests/cpp/gbm/test_gbtree.cc | 43 +-- tests/cpp/gbm/test_gbtree.cu | 88 ++++++ tests/cpp/helpers.cc | 3 + tests/cpp/helpers.h | 20 +- tests/cpp/predictor/test_cpu_predictor.cc | 69 +++-- tests/cpp/predictor/test_gpu_predictor.cu | 40 +-- tests/cpp/predictor/test_predictor.cc | 227 ++++++++-------- tests/cpp/predictor/test_predictor.h | 43 +-- tests/cpp/test_learner.cc | 10 - tests/cpp/test_serialization.cc | 4 - tests/python-gpu/load_pickle.py | 44 ++- .../test_device_quantile_dmatrix.py | 2 +- tests/python-gpu/test_from_cupy.py | 7 +- tests/python-gpu/test_gpu_pickling.py | 98 +++---- tests/python-gpu/test_gpu_prediction.py | 250 ++++++++++-------- tests/python-gpu/test_gpu_ranking.py | 2 - tests/python-gpu/test_gpu_updaters.py | 1 - tests/python/test_predict.py | 9 +- tests/python/test_updaters.py | 8 +- tests/python/test_with_sklearn.py | 17 -- 54 files changed, 1049 insertions(+), 778 deletions(-) create mode 100644 tests/cpp/gbm/test_gbtree.cu diff --git a/doc/gpu/index.rst b/doc/gpu/index.rst index 716ad0d58..97c9799fd 100644 --- a/doc/gpu/index.rst +++ b/doc/gpu/index.rst @@ -45,7 +45,7 @@ XGBoost makes use of `GPUTreeShap `_ as .. 
code-block:: python - model.set_param({"predictor": "gpu_predictor"}) + model.set_param({"gpu_id": "0", "tree_method": "gpu_hist"}) shap_values = model.predict(dtrain, pred_contribs=True) shap_interaction_values = model.predict(dtrain, pred_interactions=True) diff --git a/doc/parameter.rst b/doc/parameter.rst index f6d3a06b6..22893e400 100644 --- a/doc/parameter.rst +++ b/doc/parameter.rst @@ -199,18 +199,6 @@ Parameters for Tree Booster - Maximum number of discrete bins to bucket continuous features. - Increasing this number improves the optimality of splits at the cost of higher computation time. -* ``predictor``, [default= ``auto``] - - - The type of predictor algorithm to use. Provides the same results but allows the use of GPU or CPU. - - - ``auto``: Configure predictor based on heuristics. - - ``cpu_predictor``: Multicore CPU prediction algorithm. - - ``gpu_predictor``: Prediction using GPU. Used when ``tree_method`` is ``gpu_hist``. - When ``predictor`` is set to default value ``auto``, the ``gpu_hist`` tree method is - able to provide GPU based prediction without copying training data to GPU memory. - If ``gpu_predictor`` is explicitly specified, then all data is copied into GPU, only - recommended for performing prediction tasks. - * ``num_parallel_tree``, [default=1] - Number of parallel trees constructed during each iteration. This option is used to support boosted random forest. diff --git a/doc/prediction.rst b/doc/prediction.rst index 85d8b6150..b98c2fc6b 100644 --- a/doc/prediction.rst +++ b/doc/prediction.rst @@ -87,15 +87,6 @@ with the native Python interface :py:meth:`xgboost.Booster.predict` and behavior. Also the ``save_best`` parameter from :py:obj:`xgboost.callback.EarlyStopping` might be useful. -********* -Predictor -********* - -There are 2 predictors in XGBoost (3 if you have the one-api plugin enabled), namely -``cpu_predictor`` and ``gpu_predictor``. The default option is ``auto`` so that XGBoost -can employ some heuristics for saving GPU memory during training. They might have slight -different outputs due to floating point errors. - *********** Base Margin @@ -134,15 +125,6 @@ it. Be aware that the output of in-place prediction depends on input data type, input is on GPU data output is :py:obj:`cupy.ndarray`, otherwise a :py:obj:`numpy.ndarray` is returned. -**************** -Categorical Data -**************** - -Other than users performing encoding, XGBoost has experimental support for categorical -data using ``gpu_hist`` and ``gpu_predictor``. No special operation needs to be done on -input test data since the information about categories is encoded into the model during -training. - ************* Thread Safety ************* @@ -159,7 +141,6 @@ instance we might accidentally call ``clf.set_params()`` inside a predict functi def predict_fn(clf: xgb.XGBClassifier, X): X = preprocess(X) - clf.set_params(predictor="gpu_predictor") # NOT safe! clf.set_params(n_jobs=1) # NOT safe! return clf.predict_proba(X, iteration_range=(0, 10)) diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst index fa487f1c8..8cb2e6ee2 100644 --- a/doc/tutorials/dask.rst +++ b/doc/tutorials/dask.rst @@ -148,8 +148,8 @@ Also for inplace prediction: .. code-block:: python - booster.set_param({'predictor': 'gpu_predictor'}) - # where X is a dask DataFrame or dask Array containing cupy or cuDF backed data. + # where X is a dask DataFrame or dask Array backed by cupy or cuDF. 
+ booster.set_param({"gpu_id": "0"}) prediction = xgb.dask.inplace_predict(client, booster, X) When input is ``da.Array`` object, output is always ``da.Array``. However, if the input diff --git a/doc/tutorials/saving_model.rst b/doc/tutorials/saving_model.rst index 1fdca35e1..e536f3fcc 100644 --- a/doc/tutorials/saving_model.rst +++ b/doc/tutorials/saving_model.rst @@ -173,7 +173,6 @@ Will print out something similar to (not actual output as it's too long for demo "gradient_booster": { "gbtree_train_param": { "num_parallel_tree": "1", - "predictor": "gpu_predictor", "process_type": "default", "tree_method": "gpu_hist", "updater": "grow_gpu_hist", diff --git a/include/xgboost/base.h b/include/xgboost/base.h index 43540beea..6ccd168f3 100644 --- a/include/xgboost/base.h +++ b/include/xgboost/base.h @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -112,7 +113,7 @@ using bst_row_t = std::size_t; // NOLINT /*! \brief Type for tree node index. */ using bst_node_t = std::int32_t; // NOLINT /*! \brief Type for ranking group index. */ -using bst_group_t = std::uint32_t; // NOLINT +using bst_group_t = std::uint32_t; // NOLINT /** * \brief Type for indexing into output targets. */ @@ -125,6 +126,10 @@ using bst_layer_t = std::int32_t; // NOLINT * \brief Type for indexing trees. */ using bst_tree_t = std::int32_t; // NOLINT +/** + * @brief Ordinal of a CUDA device. + */ +using bst_d_ordinal_t = std::int16_t; // NOLINT namespace detail { /*! \brief Implementation of gradient statistics pair. Template specialisation diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index 4b9d37335..3cfba0468 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -1067,6 +1067,9 @@ XGB_DLL int XGBoosterPredictFromDMatrix(BoosterHandle handle, DMatrixHandle dmat /** * \brief Inplace prediction from CPU dense matrix. * + * \note If the booster is configured to run on a CUDA device, XGBoost falls back to run + * prediction with DMatrix with a performance warning. + * * \param handle Booster handle. * \param values JSON encoded __array_interface__ to values. * \param config See \ref XGBoosterPredictFromDMatrix for more info. @@ -1091,6 +1094,9 @@ XGB_DLL int XGBoosterPredictFromDense(BoosterHandle handle, char const *values, /** * \brief Inplace prediction from CPU CSR matrix. * + * \note If the booster is configured to run on a CUDA device, XGBoost falls back to run + * prediction with DMatrix with a performance warning. + * * \param handle Booster handle. * \param indptr JSON encoded __array_interface__ to row pointer in CSR. * \param indices JSON encoded __array_interface__ to column indices in CSR. @@ -1116,6 +1122,9 @@ XGB_DLL int XGBoosterPredictFromCSR(BoosterHandle handle, char const *indptr, ch /** * \brief Inplace prediction from CUDA Dense matrix (cupy in Python). * + * \note If the booster is configured to run on a CPU, XGBoost falls back to run + * prediction with DMatrix with a performance warning. + * * \param handle Booster handle * \param values JSON encoded __cuda_array_interface__ to values. * \param config See \ref XGBoosterPredictFromDMatrix for more info. @@ -1137,6 +1146,9 @@ XGB_DLL int XGBoosterPredictFromCudaArray(BoosterHandle handle, char const *valu /** * \brief Inplace prediction from CUDA dense dataframe (cuDF in Python). * + * \note If the booster is configured to run on a CPU, XGBoost falls back to run + * prediction with DMatrix with a performance warning. 
+ * * \param handle Booster handle * \param values List of __cuda_array_interface__ for all columns encoded in JSON list. * \param config See \ref XGBoosterPredictFromDMatrix for more info. diff --git a/include/xgboost/context.h b/include/xgboost/context.h index f1cd391df..b11ca70ec 100644 --- a/include/xgboost/context.h +++ b/include/xgboost/context.h @@ -1,20 +1,79 @@ -/*! - * Copyright 2014-2022 by Contributors +/** + * Copyright 2014-2023, XGBoost Contributors * \file context.h */ #ifndef XGBOOST_CONTEXT_H_ #define XGBOOST_CONTEXT_H_ -#include -#include +#include // for bst_d_ordinal_t +#include // for CHECK_GE +#include // for XGBoostParameter -#include // std::shared_ptr -#include +#include // for int16_t, int32_t, int64_t +#include // for shared_ptr +#include // for string, to_string namespace xgboost { struct CUDAContext; +/** + * @brief A type for device ordinal. The type is packed into 32-bit for efficient use in + * viewing types like `linalg::TensorView`. + */ +struct DeviceOrd { + enum Type : std::int16_t { kCPU = 0, kCUDA = 1 } device{kCPU}; + // CUDA device ordinal. + bst_d_ordinal_t ordinal{-1}; + + [[nodiscard]] bool IsCUDA() const { return device == kCUDA; } + [[nodiscard]] bool IsCPU() const { return device == kCPU; } + + DeviceOrd() = default; + constexpr DeviceOrd(Type type, bst_d_ordinal_t ord) : device{type}, ordinal{ord} {} + + DeviceOrd(DeviceOrd const& that) = default; + DeviceOrd& operator=(DeviceOrd const& that) = default; + DeviceOrd(DeviceOrd&& that) = default; + DeviceOrd& operator=(DeviceOrd&& that) = default; + + /** + * @brief Constructor for CPU. + */ + [[nodiscard]] constexpr static auto CPU() { return DeviceOrd{kCPU, -1}; } + /** + * @brief Constructor for CUDA device. + * + * @param ordinal CUDA device ordinal. + */ + [[nodiscard]] static auto CUDA(bst_d_ordinal_t ordinal) { return DeviceOrd{kCUDA, ordinal}; } + + [[nodiscard]] bool operator==(DeviceOrd const& that) const { + return device == that.device && ordinal == that.ordinal; + } + [[nodiscard]] bool operator!=(DeviceOrd const& that) const { return !(*this == that); } + /** + * @brief Get a string representation of the device and the ordinal. + */ + [[nodiscard]] std::string Name() const { + switch (device) { + case DeviceOrd::kCPU: + return "CPU"; + case DeviceOrd::kCUDA: + return "CUDA:" + std::to_string(ordinal); + default: { + LOG(FATAL) << "Unknown device."; + return ""; + } + } + } +}; + +static_assert(sizeof(DeviceOrd) == sizeof(std::int32_t)); + +/** + * @brief Runtime context for XGBoost. Contains information like threads and device. + */ struct Context : public XGBoostParameter { public: // Constant representing the device ID of CPU. @@ -36,29 +95,59 @@ struct Context : public XGBoostParameter { // fail when gpu_id is invalid bool fail_on_invalid_gpu_id{false}; bool validate_parameters{false}; - - /*! - * \brief Configure the parameter `gpu_id'. + /** + * @brief Configure the parameter `gpu_id'. * - * \param require_gpu Whether GPU is explicitly required from user. + * @param require_gpu Whether GPU is explicitly required by the user through other + * configurations. */ void ConfigureGpuId(bool require_gpu); - /*! - * Return automatically chosen threads. + /** + * @brief Returns the automatically chosen number of threads based on the `nthread` + * parameter and the system settting. 
*/ - std::int32_t Threads() const; - - bool IsCPU() const { return gpu_id == kCpuId; } - bool IsCUDA() const { return !IsCPU(); } - - CUDAContext const* CUDACtx() const; - // Make a CUDA context based on the current context. - Context MakeCUDA(std::int32_t device = 0) const { + [[nodiscard]] std::int32_t Threads() const; + /** + * @brief Is XGBoost running on CPU? + */ + [[nodiscard]] bool IsCPU() const { return gpu_id == kCpuId; } + /** + * @brief Is XGBoost running on a CUDA device? + */ + [[nodiscard]] bool IsCUDA() const { return !IsCPU(); } + /** + * @brief Get the current device and ordinal. + */ + [[nodiscard]] DeviceOrd Device() const { + return IsCPU() ? DeviceOrd::CPU() : DeviceOrd::CUDA(static_cast(gpu_id)); + } + /** + * @brief Get the CUDA device ordinal. -1 if XGBoost is running on CPU. + */ + [[nodiscard]] bst_d_ordinal_t Ordinal() const { return this->gpu_id; } + /** + * @brief Name of the current device. + */ + [[nodiscard]] std::string DeviceName() const { return Device().Name(); } + /** + * @brief Get a CUDA device context for allocator and stream. + */ + [[nodiscard]] CUDAContext const* CUDACtx() const; + /** + * @brief Make a CUDA context based on the current context. + * + * @param ordinal The CUDA device ordinal. + */ + [[nodiscard]] Context MakeCUDA(std::int32_t ordinal = 0) const { Context ctx = *this; - ctx.gpu_id = device; + CHECK_GE(ordinal, 0); + ctx.gpu_id = ordinal; return ctx; } - Context MakeCPU() const { + /** + * @brief Make a CPU context based on the current context. + */ + [[nodiscard]] Context MakeCPU() const { Context ctx = *this; ctx.gpu_id = kCpuId; return ctx; @@ -87,9 +176,9 @@ struct Context : public XGBoostParameter { } private: - // mutable for lazy initialization for cuda context to avoid initializing CUDA at load. - // shared_ptr is used instead of unique_ptr as with unique_ptr it's difficult to define p_impl - // while trying to hide CUDA code from host compiler. + // mutable for lazy cuda context initialization. This avoids initializing CUDA at load. + // shared_ptr is used instead of unique_ptr as with unique_ptr it's difficult to define + // p_impl while trying to hide CUDA code from the host compiler. mutable std::shared_ptr cuctx_; // cached value for CFS CPU limit. (used in containerized env) std::int32_t cfs_cpu_count_; // NOLINT diff --git a/include/xgboost/gbm.h b/include/xgboost/gbm.h index 4f690064f..6d3832093 100644 --- a/include/xgboost/gbm.h +++ b/include/xgboost/gbm.h @@ -149,18 +149,14 @@ class GradientBooster : public Model, public Configurable { * \param layer_begin Beginning of boosted tree layer used for prediction. * \param layer_end End of booster layer. 0 means do not limit trees. * \param approximate use a faster (inconsistent) approximation of SHAP values - * \param condition condition on the condition_feature (0=no, -1=cond off, 1=cond on). - * \param condition_feature feature to condition on (i.e. 
fix) during calculations
    */
-  virtual void PredictContribution(DMatrix* dmat,
-                                   HostDeviceVector* out_contribs,
-                                   unsigned layer_begin, unsigned layer_end,
-                                   bool approximate = false, int condition = 0,
-                                   unsigned condition_feature = 0) = 0;
+  virtual void PredictContribution(DMatrix* dmat, HostDeviceVector* out_contribs,
+                                   bst_layer_t layer_begin, bst_layer_t layer_end,
+                                   bool approximate = false) = 0;
 
-  virtual void PredictInteractionContributions(
-      DMatrix *dmat, HostDeviceVector *out_contribs,
-      unsigned layer_begin, unsigned layer_end, bool approximate) = 0;
+  virtual void PredictInteractionContributions(DMatrix* dmat, HostDeviceVector* out_contribs,
+                                               bst_layer_t layer_begin, bst_layer_t layer_end,
+                                               bool approximate) = 0;
 
   /*!
    * \brief dump the model in the requested format
diff --git a/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java b/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java
index 25705fd1b..24a1491e1 100644
--- a/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java
+++ b/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java
@@ -78,7 +78,6 @@ public class BoosterTest {
         put("num_round", round);
         put("num_workers", 1);
         put("tree_method", "gpu_hist");
-        put("predictor", "gpu_predictor");
         put("max_bin", maxBin);
       }
     };
diff --git a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala
index d28ae55e5..eef10a36d 100644
--- a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala
+++ b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala
@@ -281,7 +281,6 @@ object GpuPreXGBoost extends PreXGBoostProvider {
       // - predictor: Force to gpu predictor since native doesn't save predictor.
       val gpuId = if (!isLocal) XGBoost.getGPUAddrFromResources else 0
       booster.setParam("gpu_id", gpuId.toString)
-      booster.setParam("predictor", "gpu_predictor")
       logger.info("GPU transform on device: " + gpuId)
       boosterFlag.isGpuParamsSet = true;
     }
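The JVM changes above drop the explicit ``predictor`` parameter: where a booster runs is now derived from its device ordinal rather than from a separate predictor switch. A minimal sketch of the equivalent configuration before and after this series (Python shown for brevity; the parameter values are illustrative, not part of the patch):

.. code-block:: python

    # Pre-patch: the predictor had to be forced alongside the tree method.
    params_old = {"tree_method": "gpu_hist", "predictor": "gpu_predictor", "gpu_id": 0}

    # Post-patch: the device ordinal alone decides where prediction runs.
    params_new = {"tree_method": "gpu_hist", "gpu_id": 0}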
diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index 200490d73..07e8d89cc 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -2187,20 +2187,25 @@ class Booster:
         base_margin: Any = None,
         strict_shape: bool = False,
     ) -> NumpyOrCupy:
-        """Run prediction in-place, Unlike :py:meth:`predict` method, inplace prediction
-        does not cache the prediction result.
+        """Run prediction in-place when possible. Unlike the :py:meth:`predict` method,
+        inplace prediction does not cache the prediction result.
 
         Calling only ``inplace_predict`` in multiple threads is safe and lock
         free.  But the safety does not hold when used in conjunction with other
         methods. E.g. you can't train the booster in one thread and perform
         prediction in the other.
 
+        .. note::
+
+            If the device ordinal of the input data doesn't match the one configured for
+            the booster, data will be copied to the booster device.
+
         .. code-block:: python
 
-            booster.set_param({"predictor": "gpu_predictor"})
+            booster.set_param({"gpu_id": "0", "tree_method": "gpu_hist"})
             booster.inplace_predict(cupy_array)
-            booster.set_param({"predictor": "cpu_predictor"})
+            booster.set_param({"gpu_id": "-1", "tree_method": "hist"})
             booster.inplace_predict(numpy_array)
 
         .. versionadded:: 1.1.0
 
@@ -2208,9 +2213,7 @@ class Booster:
         Parameters
         ----------
         data :
-            The input data, must not be a view for numpy array. Set
-            ``predictor`` to ``gpu_predictor`` for running prediction on CuPy
-            array or CuDF DataFrame.
+            The input data.
         iteration_range :
             See :py:meth:`predict` for details.
         predict_type :
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index 4cc8a174c..440cd34be 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -277,9 +277,6 @@ __model_doc = f"""
         Device ordinal.
     validate_parameters : Optional[bool]
         Give warnings for unknown parameter.
-    predictor : Optional[str]
-        Force XGBoost to use specific predictor, available choices are [cpu_predictor,
-        gpu_predictor].
     enable_categorical : bool
 
         .. versionadded:: 1.5.0
@@ -652,7 +649,6 @@ class XGBModel(XGBModelBase):
         importance_type: Optional[str] = None,
         gpu_id: Optional[int] = None,
         validate_parameters: Optional[bool] = None,
-        predictor: Optional[str] = None,
         enable_categorical: bool = False,
         feature_types: Optional[FeatureTypes] = None,
         max_cat_to_onehot: Optional[int] = None,
@@ -699,7 +695,6 @@ class XGBModel(XGBModelBase):
         self.importance_type = importance_type
         self.gpu_id = gpu_id
         self.validate_parameters = validate_parameters
-        self.predictor = predictor
         self.enable_categorical = enable_categorical
         self.feature_types = feature_types
         self.max_cat_to_onehot = max_cat_to_onehot
@@ -1093,12 +1088,7 @@ class XGBModel(XGBModelBase):
         return self
 
     def _can_use_inplace_predict(self) -> bool:
-        # When predictor is explicitly set, using `inplace_predict` might result into
-        # error with incompatible data type.
-        # Inplace predict doesn't handle as many data types as DMatrix, but it's
-        # sufficient for dask interface where input is simpiler.
-        predictor = self.get_xgb_params().get("predictor", None)
-        if predictor in ("auto", None) and self.booster != "gblinear":
+        if self.booster != "gblinear":
             return True
         return False
 
@@ -1124,9 +1114,9 @@ class XGBModel(XGBModelBase):
         iteration_range: Optional[Tuple[int, int]] = None,
     ) -> ArrayLike:
         """Predict with `X`.  If the model is trained with early stopping, then
-        :py:attr:`best_iteration` is used automatically. For tree models, when data is
-        on GPU, like cupy array or cuDF dataframe and `predictor` is not specified, the
-        prediction is run on GPU automatically, otherwise it will run on CPU.
+        :py:attr:`best_iteration` is used automatically. The estimator uses
+        `inplace_predict` by default and falls back to using :py:class:`DMatrix` if
+        devices between the data and the estimator don't match.
 
         .. note:: This function is only thread safe for `gbtree` and `dart`.
@@ -1588,7 +1578,9 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
     ) -> np.ndarray:
         """Predict the probability of each `X` example being of a given class. If the
         model is trained with early stopping, then :py:attr:`best_iteration` is used
-        automatically.
+        automatically. The estimator uses `inplace_predict` by default and falls back to
+        using :py:class:`DMatrix` if devices between the data and the estimator don't
+        match.
 
         .. note:: This function is only thread safe for `gbtree` and `dart`.
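The docstring updates above define the new dispatch rule: prediction runs in-place when the data and the booster live on the same device, and otherwise copies through a :py:class:`DMatrix` with a warning. A minimal sketch of both paths (assuming a CUDA-enabled build with ``cupy`` installed; the data shapes and round count are illustrative, not part of the patch):

.. code-block:: python

    import numpy as np
    import xgboost as xgb

    X, y = np.random.rand(128, 8), np.random.rand(128)
    booster = xgb.train(
        {"tree_method": "hist", "gpu_id": -1}, xgb.DMatrix(X, label=y), num_boost_round=4
    )

    # Matching devices (CPU booster, numpy input): prediction runs in-place.
    predt = booster.inplace_predict(X)

    # Mismatched devices (CPU booster, GPU input): XGBoost warns about falling
    # back and predicts through a DMatrix instead of raising an error.
    import cupy as cp
    predt_gpu = booster.inplace_predict(cp.asarray(X))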
diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index f6abb867e..862375026 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -25,6 +25,7 @@ from typing import ( Set, Tuple, TypedDict, + TypeVar, Union, ) @@ -711,6 +712,27 @@ def predictor_equal(lhs: xgb.DMatrix, rhs: xgb.DMatrix) -> bool: ) +M = TypeVar("M", xgb.Booster, xgb.XGBModel) + + +def set_ordinal(ordinal: int, booster: M) -> M: + """Temporary solution for setting the device ordinal until we move away from + `gpu_id`. + + """ + if ordinal < 0: + params = {"gpu_id": -1, "tree_method": "hist"} + else: + params = {"gpu_id": ordinal, "tree_method": "gpu_hist"} + + if isinstance(booster, xgb.Booster): + booster.set_param(params) + elif isinstance(booster, xgb.XGBModel): + booster.set_params(**params) + + return booster + + def eval_error_metric(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, np.float64]: """Evaluation metric for xgb.train""" label = dtrain.get_label() diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index b35879fd7..e0f2d47b0 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -1023,7 +1023,6 @@ void InplacePredictImpl(std::shared_ptr p_m, char const *c_json_config, const float **out_result) { xgboost_CHECK_C_ARG_PTR(c_json_config); auto config = Json::Load(StringView{c_json_config}); - CHECK_EQ(get(config["cache_id"]), 0) << "Cache ID is not supported yet"; HostDeviceVector *p_predt{nullptr}; auto type = PredictionType(RequiredArg(config, "type", __func__)); @@ -1042,6 +1041,7 @@ void InplacePredictImpl(std::shared_ptr p_m, char const *c_json_config, xgboost_CHECK_C_ARG_PTR(out_dim); CalcPredictShape(strict_shape, type, n_samples, n_features, chunksize, learner->Groups(), learner->BoostedRounds(), &shape, out_dim); + CHECK_GE(p_predt->Size(), n_samples); xgboost_CHECK_C_ARG_PTR(out_result); xgboost_CHECK_C_ARG_PTR(out_shape); diff --git a/src/c_api/c_api.cu b/src/c_api/c_api.cu index e6201b0fd..af060f6dc 100644 --- a/src/c_api/c_api.cu +++ b/src/c_api/c_api.cu @@ -92,7 +92,7 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data, API_END(); } -int InplacePreidctCuda(BoosterHandle handle, char const *c_array_interface, +int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface, char const *c_json_config, std::shared_ptr p_m, xgboost::bst_ulong const **out_shape, xgboost::bst_ulong *out_dim, const float **out_result) { @@ -107,7 +107,6 @@ int InplacePreidctCuda(BoosterHandle handle, char const *c_array_interface, proxy->SetCUDAArray(c_array_interface); auto config = Json::Load(StringView{c_json_config}); - CHECK_EQ(get(config["cache_id"]), 0) << "Cache ID is not supported yet"; auto *learner = static_cast(handle); HostDeviceVector *p_predt{nullptr}; @@ -118,7 +117,13 @@ int InplacePreidctCuda(BoosterHandle handle, char const *c_array_interface, RequiredArg(config, "iteration_begin", __func__), RequiredArg(config, "iteration_end", __func__)); CHECK(p_predt); - CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead()); + if (learner->Ctx()->IsCPU()) { + // Prediction using DMatrix as fallback. 
+ CHECK(p_predt->HostCanRead() && !p_predt->DeviceCanRead()); + } else { + CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead()); + } + p_predt->SetDevice(proxy->DeviceIdx()); auto &shape = learner->GetThreadLocal().prediction_shape; size_t n_samples = p_m->Info().num_row_; @@ -146,7 +151,7 @@ XGB_DLL int XGBoosterPredictFromCudaColumnar(BoosterHandle handle, char const *c if (m) { p_m = *static_cast *>(m); } - return InplacePreidctCuda(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim, + return InplacePreidctCUDA(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim, out_result); } @@ -159,6 +164,6 @@ XGB_DLL int XGBoosterPredictFromCudaArray(BoosterHandle handle, char const *c_js p_m = *static_cast *>(m); } xgboost_CHECK_C_ARG_PTR(out_result); - return InplacePreidctCuda(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim, + return InplacePreidctCUDA(handle, c_json_strs, c_json_config, p_m, out_shape, out_dim, out_result); } diff --git a/src/common/error_msg.h b/src/common/error_msg.h index 3f57a63a3..e690a12f3 100644 --- a/src/common/error_msg.h +++ b/src/common/error_msg.h @@ -6,6 +6,11 @@ #ifndef XGBOOST_COMMON_ERROR_MSG_H_ #define XGBOOST_COMMON_ERROR_MSG_H_ +#include // for uint64_t +#include // for numeric_limits + +#include "xgboost/base.h" // for bst_feature_t +#include "xgboost/logging.h" #include "xgboost/string_view.h" // for StringView namespace xgboost::error { @@ -33,5 +38,14 @@ constexpr StringView InconsistentMaxBin() { return "Inconsistent `max_bin`. `max_bin` should be the same across different QuantileDMatrix, " "and consistent with the Booster being trained."; } + +constexpr StringView UnknownDevice() { return "Unknown device type."; } + +inline void MaxFeatureSize(std::uint64_t n_features) { + auto max_n_features = std::numeric_limits::max(); + CHECK_LE(n_features, max_n_features) + << "Unfortunately, XGBoost does not support data matrices with " + << std::numeric_limits::max() << " features or greater"; +} } // namespace xgboost::error #endif // XGBOOST_COMMON_ERROR_MSG_H_ diff --git a/src/data/adapter.h b/src/data/adapter.h index b027084aa..7776177ab 100644 --- a/src/data/adapter.h +++ b/src/data/adapter.h @@ -7,7 +7,7 @@ #include #include -#include // std::size_t +#include // for size_t #include #include #include @@ -17,6 +17,7 @@ #include #include "../c_api/c_api_error.h" +#include "../common/error_msg.h" // for MaxFeatureSize #include "../common/math.h" #include "array_interface.h" #include "arrow-cdi.h" @@ -300,9 +301,9 @@ class ArrayAdapter : public detail::SingleBatchDataIter { array_interface_ = ArrayInterface<2>(get(j)); batch_ = ArrayAdapterBatch{array_interface_}; } - ArrayAdapterBatch const& Value() const override { return batch_; } - size_t NumRows() const { return array_interface_.Shape(0); } - size_t NumColumns() const { return array_interface_.Shape(1); } + [[nodiscard]] ArrayAdapterBatch const& Value() const override { return batch_; } + [[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape(0); } + [[nodiscard]] std::size_t NumColumns() const { return array_interface_.Shape(1); } private: ArrayAdapterBatch batch_; diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index 881b54297..a760ec9ab 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -31,10 +31,10 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, dh::XGBCachingDeviceAllocator alloc; auto num_rows = [&]() { - return Dispatch(proxy, [](auto const& value) { return 
value.NumRows(); }); + return cuda_impl::Dispatch(proxy, [](auto const& value) { return value.NumRows(); }); }; auto num_cols = [&]() { - return Dispatch(proxy, [](auto const& value) { return value.NumCols(); }); + return cuda_impl::Dispatch(proxy, [](auto const& value) { return value.NumCols(); }); }; size_t row_stride = 0; @@ -74,7 +74,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, get_device()); auto* p_sketch = &sketch_containers.back(); proxy->Info().weights_.SetDevice(get_device()); - Dispatch(proxy, [&](auto const& value) { + cuda_impl::Dispatch(proxy, [&](auto const& value) { common::AdapterDeviceSketch(value, p.max_bin, proxy->Info(), missing, p_sketch); }); } @@ -82,7 +82,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, accumulated_rows += batch_rows; dh::device_vector row_counts(batch_rows + 1, 0); common::Span row_counts_span(row_counts.data().get(), row_counts.size()); - row_stride = std::max(row_stride, Dispatch(proxy, [=](auto const& value) { + row_stride = std::max(row_stride, cuda_impl::Dispatch(proxy, [=](auto const& value) { return GetRowCounts(value, row_counts_span, get_device(), missing); })); nnz += thrust::reduce(thrust::cuda::par(alloc), row_counts.begin(), row_counts.end()); @@ -136,14 +136,14 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, auto rows = num_rows(); dh::device_vector row_counts(rows + 1, 0); common::Span row_counts_span(row_counts.data().get(), row_counts.size()); - Dispatch(proxy, [=](auto const& value) { + cuda_impl::Dispatch(proxy, [=](auto const& value) { return GetRowCounts(value, row_counts_span, get_device(), missing); }); auto is_dense = this->IsDense(); proxy->Info().feature_types.SetDevice(get_device()); auto d_feature_types = proxy->Info().feature_types.ConstDeviceSpan(); - auto new_impl = Dispatch(proxy, [&](auto const& value) { + auto new_impl = cuda_impl::Dispatch(proxy, [&](auto const& value) { return EllpackPageImpl(value, missing, get_device(), is_dense, row_counts_span, d_feature_types, row_stride, rows, cuts); }); diff --git a/src/data/proxy_dmatrix.cc b/src/data/proxy_dmatrix.cc index fc36f75f2..e0a28142d 100644 --- a/src/data/proxy_dmatrix.cc +++ b/src/data/proxy_dmatrix.cc @@ -1,14 +1,13 @@ -/*! 
- * Copyright 2021 by Contributors +/** + * Copyright 2021-2023, XGBoost Contributors * \file proxy_dmatrix.cc */ #include "proxy_dmatrix.h" -namespace xgboost { -namespace data { -void DMatrixProxy::SetArrayData(char const *c_interface) { - std::shared_ptr adapter{new ArrayAdapter(StringView{c_interface})}; +namespace xgboost::data { +void DMatrixProxy::SetArrayData(StringView interface_str) { + std::shared_ptr adapter{new ArrayAdapter{interface_str}}; this->batch_ = adapter; this->Info().num_col_ = adapter->NumColumns(); this->Info().num_row_ = adapter->NumRows(); @@ -25,5 +24,36 @@ void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices, this->Info().num_row_ = adapter->NumRows(); this->ctx_.gpu_id = Context::kCpuId; } -} // namespace data -} // namespace xgboost + +namespace cuda_impl { +std::shared_ptr CreateDMatrixFromProxy(Context const *ctx, + std::shared_ptr proxy, float missing); +#if !defined(XGBOOST_USE_CUDA) +std::shared_ptr CreateDMatrixFromProxy(Context const *, std::shared_ptr, + float) { + return nullptr; +} +#endif // XGBOOST_USE_CUDA +} // namespace cuda_impl + +std::shared_ptr CreateDMatrixFromProxy(Context const *ctx, + std::shared_ptr proxy, + float missing) { + bool type_error{false}; + std::shared_ptr p_fmat{nullptr}; + if (proxy->Ctx()->IsCPU()) { + p_fmat = data::HostAdapterDispatch( + proxy.get(), + [&](auto const &adapter) { + auto p_fmat = + std::shared_ptr(DMatrix::Create(adapter.get(), missing, ctx->Threads())); + return p_fmat; + }, + &type_error); + } else { + p_fmat = cuda_impl::CreateDMatrixFromProxy(ctx, proxy, missing); + } + + return p_fmat; +} +} // namespace xgboost::data diff --git a/src/data/proxy_dmatrix.cu b/src/data/proxy_dmatrix.cu index 2c615c5fd..65abd1b7d 100644 --- a/src/data/proxy_dmatrix.cu +++ b/src/data/proxy_dmatrix.cu @@ -1,12 +1,11 @@ -/*! 
- * Copyright 2020-2022, XGBoost contributors
+/**
+ * Copyright 2020-2023, XGBoost contributors
  */
-#include "proxy_dmatrix.h"
 #include "device_adapter.cuh"
+#include "proxy_dmatrix.cuh"
+#include "proxy_dmatrix.h"
 
-namespace xgboost {
-namespace data {
-
+namespace xgboost::data {
 void DMatrixProxy::FromCudaColumnar(StringView interface_str) {
   std::shared_ptr adapter{new CudfAdapter{interface_str}};
   auto const& value = adapter->Value();
@@ -31,5 +30,15 @@ void DMatrixProxy::FromCudaArray(StringView interface_str) {
     ctx_.gpu_id = dh::CurrentDevice();
   }
 }
-}  // namespace data
-}  // namespace xgboost
+
+namespace cuda_impl {
+std::shared_ptr<DMatrix> CreateDMatrixFromProxy(Context const* ctx,
+                                                std::shared_ptr<DMatrixProxy> proxy,
+                                                float missing) {
+  return Dispatch<false>(proxy.get(), [&](auto const& adapter) {
+    auto p_fmat = std::shared_ptr<DMatrix>{DMatrix::Create(adapter.get(), missing, ctx->Threads())};
+    return p_fmat;
+  });
+}
+}  // namespace cuda_impl
+}  // namespace xgboost::data
diff --git a/src/data/proxy_dmatrix.cuh b/src/data/proxy_dmatrix.cuh
index 6ea858e7e..db53b992d 100644
--- a/src/data/proxy_dmatrix.cuh
+++ b/src/data/proxy_dmatrix.cuh
@@ -6,19 +6,34 @@
 #include "device_adapter.cuh"
 #include "proxy_dmatrix.h"
 
-namespace xgboost::data {
-template <typename Fn>
+namespace xgboost::data::cuda_impl {
+template <bool get_value = true, typename Fn>
 decltype(auto) Dispatch(DMatrixProxy const* proxy, Fn fn) {
   if (proxy->Adapter().type() == typeid(std::shared_ptr)) {
-    auto value = std::any_cast>(proxy->Adapter())->Value();
-    return fn(value);
+    if constexpr (get_value) {
+      auto value = std::any_cast>(proxy->Adapter())->Value();
+      return fn(value);
+    } else {
+      auto value = std::any_cast>(proxy->Adapter());
+      return fn(value);
+    }
   } else if (proxy->Adapter().type() == typeid(std::shared_ptr)) {
-    auto value = std::any_cast>(proxy->Adapter())->Value();
-    return fn(value);
+    if constexpr (get_value) {
+      auto value = std::any_cast>(proxy->Adapter())->Value();
+      return fn(value);
+    } else {
+      auto value = std::any_cast>(proxy->Adapter());
+      return fn(value);
+    }
   } else {
     LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name();
-    auto value = std::any_cast>(proxy->Adapter())->Value();
-    return fn(value);
+    if constexpr (get_value) {
+      auto value = std::any_cast>(proxy->Adapter())->Value();
+      return fn(value);
+    } else {
+      auto value = std::any_cast>(proxy->Adapter());
+      return fn(value);
+    }
   }
 }
-}  // namespace xgboost::data
+}  // namespace xgboost::data::cuda_impl
diff --git a/src/data/proxy_dmatrix.h b/src/data/proxy_dmatrix.h
index 2c18ffc79..59f0935be 100644
--- a/src/data/proxy_dmatrix.h
+++ b/src/data/proxy_dmatrix.h
@@ -62,7 +62,7 @@ class DMatrixProxy : public DMatrix {
 #endif  // defined(XGBOOST_USE_CUDA)
   }
 
-  void SetArrayData(char const* c_interface);
+  void SetArrayData(StringView interface_str);
   void SetCSRData(char const* c_indptr, char const* c_indices, char const* c_values,
                   bst_feature_t n_features, bool on_host);
 
@@ -114,28 +114,62 @@ inline DMatrixProxy* MakeProxy(DMatrixHandle proxy) {
   return typed;
 }
 
-template <typename Fn>
+/**
+ * @brief Dispatch function call based on input type.
+ *
+ * @tparam get_value Whether the function Fn accepts an adapter batch or the adapter itself.
+ * @tparam Fn        The type of the function to be dispatched.
+ *
+ * @param proxy The proxy object holding the reference to the input.
+ * @param fn The function to be dispatched.
+ * @param type_error[out] Set to true if it's not null and the input data is not recognized by
+ *        the host.
+ *
+ * @return The return value of the function being dispatched.
+ */ +template decltype(auto) HostAdapterDispatch(DMatrixProxy const* proxy, Fn fn, bool* type_error = nullptr) { if (proxy->Adapter().type() == typeid(std::shared_ptr)) { - auto value = std::any_cast>(proxy->Adapter())->Value(); + if constexpr (get_value) { + auto value = std::any_cast>(proxy->Adapter())->Value(); + return fn(value); + } else { + auto value = std::any_cast>(proxy->Adapter()); + return fn(value); + } if (type_error) { *type_error = false; } - return fn(value); } else if (proxy->Adapter().type() == typeid(std::shared_ptr)) { - auto value = std::any_cast>(proxy->Adapter())->Value(); + if constexpr (get_value) { + auto value = std::any_cast>(proxy->Adapter())->Value(); + return fn(value); + } else { + auto value = std::any_cast>(proxy->Adapter()); + return fn(value); + } if (type_error) { *type_error = false; } - return fn(value); } else { if (type_error) { *type_error = true; } else { LOG(FATAL) << "Unknown type: " << proxy->Adapter().type().name(); } - return std::result_of_t>()->Value()))>(); + if constexpr (get_value) { + return std::result_of_t>()->Value()))>(); + } else { + return std::result_of_t>()))>(); + } } } + +/** + * @brief Create a `SimpleDMatrix` instance from a `DMatrixProxy`. + */ +std::shared_ptr CreateDMatrixFromProxy(Context const* ctx, + std::shared_ptr proxy, float missing); } // namespace xgboost::data #endif // XGBOOST_DATA_PROXY_DMATRIX_H_ diff --git a/src/data/sparse_page_source.cu b/src/data/sparse_page_source.cu index 41d4f3584..8d4adda17 100644 --- a/src/data/sparse_page_source.cu +++ b/src/data/sparse_page_source.cu @@ -1,33 +1,31 @@ -/*! - * Copyright 2021 XGBoost contributors +/** + * Copyright 2021-2023, XGBoost contributors */ +#include "../common/device_helpers.cuh" // for CurrentDevice +#include "proxy_dmatrix.cuh" // for Dispatch, DMatrixProxy +#include "simple_dmatrix.cuh" // for CopyToSparsePage #include "sparse_page_source.h" -#include "proxy_dmatrix.cuh" -#include "simple_dmatrix.cuh" - -namespace xgboost { -namespace data { +#include "xgboost/data.h" // for SparsePage +namespace xgboost::data { namespace detail { std::size_t NSamplesDevice(DMatrixProxy *proxy) { - return Dispatch(proxy, [](auto const &value) { return value.NumRows(); }); + return cuda_impl::Dispatch(proxy, [](auto const &value) { return value.NumRows(); }); } std::size_t NFeaturesDevice(DMatrixProxy *proxy) { - return Dispatch(proxy, [](auto const &value) { return value.NumCols(); }); + return cuda_impl::Dispatch(proxy, [](auto const &value) { return value.NumCols(); }); } } // namespace detail -void DevicePush(DMatrixProxy* proxy, float missing, SparsePage* page) { +void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) { auto device = proxy->DeviceIdx(); if (device < 0) { device = dh::CurrentDevice(); } CHECK_GE(device, 0); - Dispatch(proxy, [&](auto const &value) { - CopyToSparsePage(value, device, missing, page); - }); + cuda_impl::Dispatch(proxy, + [&](auto const &value) { CopyToSparsePage(value, device, missing, page); }); } -} // namespace data -} // namespace xgboost +} // namespace xgboost::data diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc index f1189886c..64e9603de 100644 --- a/src/gbm/gblinear.cc +++ b/src/gbm/gblinear.cc @@ -172,8 +172,7 @@ class GBLinear : public GradientBooster { } void PredictContribution(DMatrix* p_fmat, HostDeviceVector* out_contribs, - uint32_t layer_begin, uint32_t /*layer_end*/, bool, int, - unsigned) override { + bst_layer_t layer_begin, bst_layer_t /*layer_end*/, bool) override { model_.LazyInitModel(); 
LinearCheckLayer(layer_begin); auto base_margin = p_fmat->Info().base_margin_.View(Context::kCpuId); @@ -210,8 +209,8 @@ class GBLinear : public GradientBooster { } } - void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector* out_contribs, - unsigned layer_begin, unsigned /*layer_end*/, + void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector* out_contribs, + bst_layer_t layer_begin, bst_layer_t /*layer_end*/, bool) override { LinearCheckLayer(layer_begin); std::vector& contribs = out_contribs->HostVector(); diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index a4f91abe3..9d595c378 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -18,9 +18,11 @@ #include #include "../common/common.h" +#include "../common/error_msg.h" // for UnknownDevice #include "../common/random.h" #include "../common/threading_utils.h" #include "../common/timer.h" +#include "../data/proxy_dmatrix.h" // for DMatrixProxy, HostAdapterDispatch #include "gbtree_model.h" #include "xgboost/base.h" #include "xgboost/data.h" @@ -58,9 +60,8 @@ void GBTree::Configure(Args const& cfg) { cpu_predictor_->Configure(cfg); #if defined(XGBOOST_USE_CUDA) auto n_gpus = common::AllVisibleGPUs(); - if (!gpu_predictor_ && n_gpus != 0) { - gpu_predictor_ = std::unique_ptr( - Predictor::Create("gpu_predictor", this->ctx_)); + if (!gpu_predictor_) { + gpu_predictor_ = std::unique_ptr(Predictor::Create("gpu_predictor", this->ctx_)); } if (n_gpus != 0) { gpu_predictor_->Configure(cfg); @@ -374,12 +375,7 @@ void GBTree::LoadConfig(Json const& in) { // This would cause all trees to be pushed to trees_to_update // e.g. updating a model, then saving and loading it would result in an empty model tparam_.process_type = TreeProcessType::kDefault; - int32_t const n_gpus = xgboost::common::AllVisibleGPUs(); - if (n_gpus == 0 && tparam_.predictor == PredictorType::kGPUPredictor) { - LOG(WARNING) << "Loading from a raw memory buffer on CPU only machine. " - "Changing predictor to auto."; - tparam_.UpdateAllowUnknown(Args{{"predictor", "auto"}}); - } + std::int32_t const n_gpus = xgboost::common::AllVisibleGPUs(); auto msg = StringView{ R"( @@ -505,8 +501,8 @@ void GBTree::Slice(bst_layer_t begin, bst_layer_t end, bst_layer_t step, Gradien out_model.param.num_parallel_tree = model_.param.num_parallel_tree; } -void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool, - bst_layer_t layer_begin, bst_layer_t layer_end) { +void GBTree::PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training, + bst_layer_t layer_begin, bst_layer_t layer_end) const { CHECK(configured_); if (layer_end == 0) { layer_end = this->BoostedRounds(); @@ -526,7 +522,7 @@ void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool CHECK_EQ(out_preds->version, 0); } - auto const& predictor = GetPredictor(&out_preds->predictions, p_fmat); + auto const& predictor = GetPredictor(is_training, &out_preds->predictions, p_fmat); if (out_preds->version == 0) { // out_preds->Size() can be non-zero as it's initialized here before any // tree is built at the 0^th iterator. @@ -546,68 +542,69 @@ void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool } } -std::unique_ptr const & -GBTree::GetPredictor(HostDeviceVector const *out_pred, - DMatrix *f_dmat) const { +void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training, + bst_layer_t layer_begin, bst_layer_t layer_end) { + // dispatch to const function. 
+ this->PredictBatchImpl(p_fmat, out_preds, is_training, layer_begin, layer_end); +} + +void GBTree::InplacePredict(std::shared_ptr p_m, float missing, + PredictionCacheEntry* out_preds, bst_layer_t layer_begin, + bst_layer_t layer_end) const { CHECK(configured_); - if (tparam_.predictor != PredictorType::kAuto) { - if (tparam_.predictor == PredictorType::kGPUPredictor) { -#if defined(XGBOOST_USE_CUDA) - CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost."; - CHECK(gpu_predictor_); - return gpu_predictor_; -#else - common::AssertGPUSupport(); -#endif // defined(XGBOOST_USE_CUDA) - } - if (tparam_.predictor == PredictorType::kOneAPIPredictor) { -#if defined(XGBOOST_USE_ONEAPI) - CHECK(oneapi_predictor_); - return oneapi_predictor_; -#else - common::AssertOneAPISupport(); -#endif // defined(XGBOOST_USE_ONEAPI) - } - CHECK(cpu_predictor_); - return cpu_predictor_; + auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end); + CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees."; + if (p_m->Ctx()->Device() != this->ctx_->Device()) { + LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. XGBoost " + << "is running on: " << this->ctx_->DeviceName() + << ", while the input data is on: " << p_m->Ctx()->DeviceName() << "."; + CHECK_EQ(out_preds->version, 0); + auto proxy = std::dynamic_pointer_cast(p_m); + auto any_adapter = proxy->Adapter(); + auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing); + this->PredictBatchImpl(p_fmat.get(), out_preds, false, layer_begin, layer_end); + return; } + if (this->ctx_->IsCPU()) { + this->cpu_predictor_->InplacePredict(p_m, model_, missing, out_preds, tree_begin, tree_end); + } else if (p_m->Ctx()->IsCUDA()) { + CHECK(this->gpu_predictor_); + this->gpu_predictor_->InplacePredict(p_m, model_, missing, out_preds, tree_begin, tree_end); + } else { + LOG(FATAL) << error::UnknownDevice(); + } +} + +[[nodiscard]] std::unique_ptr const& GBTree::GetPredictor( + bool is_training, HostDeviceVector const* out_pred, DMatrix* f_dmat) const { + CHECK(configured_); + // Data comes from SparsePageDMatrix. Since we are loading data in pages, no need to // prevent data copy. if (f_dmat && !f_dmat->SingleColBlock()) { if (ctx_->IsCPU()) { return cpu_predictor_; } else { -#if defined(XGBOOST_USE_CUDA) - CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost."; - return gpu_predictor_; -#else common::AssertGPUSupport(); - return cpu_predictor_; -#endif // defined(XGBOOST_USE_CUDA) + CHECK(gpu_predictor_); + return gpu_predictor_; } } // Data comes from Device DMatrix. - auto is_ellpack = f_dmat && f_dmat->PageExists() && - !f_dmat->PageExists(); + auto is_ellpack = + f_dmat && f_dmat->PageExists() && !f_dmat->PageExists(); // Data comes from device memory, like CuDF or CuPy. - auto is_from_device = - f_dmat && f_dmat->PageExists() && - (*(f_dmat->GetBatches().begin())).data.DeviceCanRead(); + auto is_from_device = f_dmat && f_dmat->PageExists() && + (*(f_dmat->GetBatches().begin())).data.DeviceCanRead(); auto on_device = is_ellpack || is_from_device; // Use GPU Predictor if data is already on device and gpu_id is set. 
- if (on_device && ctx_->gpu_id >= 0) { -#if defined(XGBOOST_USE_CUDA) - CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost."; + if (on_device && ctx_->IsCUDA()) { + common::AssertGPUSupport(); CHECK(gpu_predictor_); return gpu_predictor_; -#else - LOG(FATAL) << "Data is on CUDA device, but XGBoost is not compiled with " - "CUDA support."; - return cpu_predictor_; -#endif // defined(XGBOOST_USE_CUDA) } // GPU_Hist by default has prediction cache calculated from quantile values, @@ -619,23 +616,19 @@ GBTree::GetPredictor(HostDeviceVector const *out_pred, if ((out_pred && out_pred->Size() == 0) && (model_.param.num_trees != 0) && // FIXME(trivialfis): Implement a better method for testing whether data // is on device after DMatrix refactoring is done. - !on_device) { + !on_device && is_training) { CHECK(cpu_predictor_); return cpu_predictor_; } - if (tparam_.tree_method == TreeMethod::kGPUHist) { -#if defined(XGBOOST_USE_CUDA) - CHECK_GE(common::AllVisibleGPUs(), 1) << "No visible GPU is found for XGBoost."; + if (ctx_->IsCPU()) { + return cpu_predictor_; + } else { + common::AssertGPUSupport(); CHECK(gpu_predictor_); return gpu_predictor_; -#else - common::AssertGPUSupport(); - return cpu_predictor_; -#endif // defined(XGBOOST_USE_CUDA) } - CHECK(cpu_predictor_); return cpu_predictor_; } @@ -750,7 +743,7 @@ class Dart : public GBTree { bool training, unsigned layer_begin, unsigned layer_end) const { CHECK(!this->model_.learner_model_param->IsVectorLeaf()) << "dart" << MTNotImplemented(); - auto &predictor = this->GetPredictor(&p_out_preds->predictions, p_fmat); + auto& predictor = this->GetPredictor(training, &p_out_preds->predictions, p_fmat); CHECK(predictor); predictor->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions, model_); @@ -814,49 +807,46 @@ class Dart : public GBTree { auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end); auto n_groups = model_.learner_model_param->num_output_group; - std::vector predictors { - cpu_predictor_.get(), -#if defined(XGBOOST_USE_CUDA) - gpu_predictor_.get() -#endif // defined(XGBOOST_USE_CUDA) - }; - Predictor const* predictor{nullptr}; - StringView msg{"Unsupported data type for inplace predict."}; + if (ctx_->Device() != p_fmat->Ctx()->Device()) { + LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. 
XGBoost " + << "is running on: " << this->ctx_->DeviceName() + << ", while the input data is on: " << p_fmat->Ctx()->DeviceName() << "."; + auto proxy = std::dynamic_pointer_cast(p_fmat); + auto any_adapter = proxy->Adapter(); + auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing); + this->PredictBatchImpl(p_fmat.get(), p_out_preds, false, layer_begin, layer_end); + return; + } + StringView msg{"Unsupported data type for inplace predict."}; PredictionCacheEntry predts; if (ctx_->gpu_id != Context::kCpuId) { predts.predictions.SetDevice(ctx_->gpu_id); } predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0); + auto get_predictor = [&]() -> Predictor const* { + if (ctx_->IsCPU()) { + return cpu_predictor_.get(); + } else if (ctx_->IsCUDA()) { + CHECK(this->gpu_predictor_); + return gpu_predictor_.get(); + } else { + LOG(FATAL) << error::UnknownDevice(); + return nullptr; + } + }; auto predict_impl = [&](size_t i) { predts.predictions.Fill(0); - if (tparam_.predictor == PredictorType::kAuto) { - // Try both predictor implementations - bool success = false; - for (auto const& p : predictors) { - if (p && p->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1)) { - success = true; - predictor = p; - break; - } - } - CHECK(success) << msg; - } else { - predictor = this->GetPredictor().get(); - bool success = predictor->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1); - CHECK(success) << msg << std::endl - << "Current Predictor: " - << (tparam_.predictor == PredictorType::kCPUPredictor ? "cpu_predictor" - : "gpu_predictor"); - } + bool success{get_predictor()->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1)}; + CHECK(success) << msg; }; // Inplace predict is not used for training, so no need to drop tree. for (bst_tree_t i = tree_begin; i < tree_end; ++i) { predict_impl(i); if (i == tree_begin) { - predictor->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions, model_); + get_predictor()->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions, model_); } // Multiple the tree weight auto w = this->weight_drop_.at(i); @@ -886,25 +876,24 @@ class Dart : public GBTree { std::vector *out_preds, unsigned layer_begin, unsigned layer_end) override { DropTrees(false); - auto &predictor = this->GetPredictor(); + auto &predictor = this->GetPredictor(false); uint32_t _, tree_end; std::tie(_, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end); predictor->PredictInstance(inst, out_preds, model_, tree_end); } - void PredictContribution(DMatrix* p_fmat, - HostDeviceVector* out_contribs, - unsigned layer_begin, unsigned layer_end, bool approximate, int, - unsigned) override { + void PredictContribution(DMatrix* p_fmat, HostDeviceVector* out_contribs, + bst_layer_t layer_begin, bst_layer_t layer_end, + bool approximate) override { CHECK(configured_); auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end); cpu_predictor_->PredictContribution(p_fmat, out_contribs, model_, tree_end, &weight_drop_, approximate); } - void PredictInteractionContributions( - DMatrix *p_fmat, HostDeviceVector *out_contribs, - unsigned layer_begin, unsigned layer_end, bool approximate) override { + void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector* out_contribs, + bst_layer_t layer_begin, bst_layer_t layer_end, + bool approximate) override { CHECK(configured_); auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end); cpu_predictor_->PredictInteractionContributions(p_fmat, out_contribs, 
model_, tree_end, diff --git a/src/gbm/gbtree.cu b/src/gbm/gbtree.cu index acff9de52..c1972b2fc 100644 --- a/src/gbm/gbtree.cu +++ b/src/gbm/gbtree.cu @@ -1,14 +1,11 @@ -/*! - * Copyright 2021 by Contributors +/** + * Copyright 2021-2023, XGBoost Contributors */ #include "../common/device_helpers.cuh" -#include "xgboost/context.h" #include "xgboost/linalg.h" #include "xgboost/span.h" -namespace xgboost { -namespace gbm { - +namespace xgboost::gbm { void GPUCopyGradient(HostDeviceVector const *in_gpair, bst_group_t n_groups, bst_group_t group_id, HostDeviceVector *out_gpair) { @@ -41,5 +38,4 @@ void GPUDartInplacePredictInc(common::Span out_predts, common::Span { std::string updater_seq; /*! \brief type of boosting process to run */ TreeProcessType process_type; - // predictor type - PredictorType predictor; // tree construction method TreeMethod tree_method; // declare parameters @@ -79,13 +69,6 @@ struct GBTreeTrainParam : public XGBoostParameter { .describe("Whether to run the normal boosting process that creates new trees,"\ " or to update the trees in an existing model."); DMLC_DECLARE_ALIAS(updater_seq, updater); - DMLC_DECLARE_FIELD(predictor) - .set_default(PredictorType::kAuto) - .add_enum("auto", PredictorType::kAuto) - .add_enum("cpu_predictor", PredictorType::kCPUPredictor) - .add_enum("gpu_predictor", PredictorType::kGPUPredictor) - .add_enum("oneapi_predictor", PredictorType::kOneAPIPredictor) - .describe("Predictor algorithm type"); DMLC_DECLARE_FIELD(tree_method) .set_default(TreeMethod::kAuto) .add_enum("auto", TreeMethod::kAuto) @@ -206,15 +189,9 @@ class GBTree : public GradientBooster { void DoBoost(DMatrix* p_fmat, HostDeviceVector* in_gpair, PredictionCacheEntry* predt, ObjFunction const* obj) override; - bool UseGPU() const override { - return - tparam_.predictor == PredictorType::kGPUPredictor || - tparam_.tree_method == TreeMethod::kGPUHist; - } + [[nodiscard]] bool UseGPU() const override { return tparam_.tree_method == TreeMethod::kGPUHist; } - GBTreeTrainParam const& GetTrainParam() const { - return tparam_; - } + [[nodiscard]] GBTreeTrainParam const& GetTrainParam() const { return tparam_; } void Load(dmlc::Stream* fi) override { model_.Load(fi); } void Save(dmlc::Stream* fo) const override { @@ -236,39 +213,14 @@ class GBTree : public GradientBooster { return !model_.trees.empty() || !model_.trees_to_update.empty(); } + void PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training, + bst_layer_t layer_begin, bst_layer_t layer_end) const; + void PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool training, bst_layer_t layer_begin, bst_layer_t layer_end) override; void InplacePredict(std::shared_ptr p_m, float missing, PredictionCacheEntry* out_preds, - bst_layer_t layer_begin, bst_layer_t layer_end) const override { - CHECK(configured_); - auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end); - CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees."; - std::vector predictors{ - cpu_predictor_.get(), -#if defined(XGBOOST_USE_CUDA) - gpu_predictor_.get() -#endif // defined(XGBOOST_USE_CUDA) - }; - StringView msg{"Unsupported data type for inplace predict."}; - if (tparam_.predictor == PredictorType::kAuto) { - // Try both predictor implementations - for (auto const &p : predictors) { - if (p && p->InplacePredict(p_m, model_, missing, out_preds, tree_begin, tree_end)) { - return; - } - } - LOG(FATAL) << msg; - } else { - bool success = this->GetPredictor()->InplacePredict(p_m, 
model_, missing, out_preds, - tree_begin, tree_end); - CHECK(success) << msg << std::endl - << "Current Predictor: " - << (tparam_.predictor == PredictorType::kCPUPredictor - ? "cpu_predictor" - : "gpu_predictor"); - } - } + bst_layer_t layer_begin, bst_layer_t layer_end) const override; void FeatureScore(std::string const& importance_type, common::Span trees, std::vector* features, @@ -349,32 +301,29 @@ class GBTree : public GradientBooster { auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end); CHECK_EQ(tree_begin, 0) << "Predict leaf supports only iteration end: (0, " "n_iteration), use model slicing instead."; - this->GetPredictor()->PredictLeaf(p_fmat, out_preds, model_, tree_end); + this->GetPredictor(false)->PredictLeaf(p_fmat, out_preds, model_, tree_end); } - void PredictContribution(DMatrix* p_fmat, - HostDeviceVector* out_contribs, - uint32_t layer_begin, uint32_t layer_end, bool approximate, - int, unsigned) override { + void PredictContribution(DMatrix* p_fmat, HostDeviceVector* out_contribs, + bst_layer_t layer_begin, bst_layer_t layer_end, + bool approximate) override { CHECK(configured_); auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end); - CHECK_EQ(tree_begin, 0) - << "Predict contribution supports only iteration end: (0, " - "n_iteration), using model slicing instead."; - this->GetPredictor()->PredictContribution( - p_fmat, out_contribs, model_, tree_end, nullptr, approximate); + CHECK_EQ(tree_begin, 0) << "Predict contribution supports only iteration end: (0, " + "n_iteration), using model slicing instead."; + this->GetPredictor(false)->PredictContribution(p_fmat, out_contribs, model_, tree_end, nullptr, + approximate); } - void PredictInteractionContributions( - DMatrix *p_fmat, HostDeviceVector *out_contribs, - uint32_t layer_begin, uint32_t layer_end, bool approximate) override { + void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector* out_contribs, + bst_layer_t layer_begin, bst_layer_t layer_end, + bool approximate) override { CHECK(configured_); auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end); - CHECK_EQ(tree_begin, 0) - << "Predict interaction contribution supports only iteration end: (0, " - "n_iteration), using model slicing instead."; - this->GetPredictor()->PredictInteractionContributions( - p_fmat, out_contribs, model_, tree_end, nullptr, approximate); + CHECK_EQ(tree_begin, 0) << "Predict interaction contribution supports only iteration end: (0, " + "n_iteration), using model slicing instead."; + this->GetPredictor(false)->PredictInteractionContributions(p_fmat, out_contribs, model_, + tree_end, nullptr, approximate); } [[nodiscard]] std::vector DumpModel(const FeatureMap& fmap, bool with_stats, @@ -390,8 +339,9 @@ class GBTree : public GradientBooster { std::vector>* out_position, std::vector>* ret); - std::unique_ptr const& GetPredictor(HostDeviceVector const* out_pred = nullptr, - DMatrix* f_dmat = nullptr) const; + [[nodiscard]] std::unique_ptr const& GetPredictor( + bool is_training, HostDeviceVector const* out_pred = nullptr, + DMatrix* f_dmat = nullptr) const; // commit new trees all at once virtual void CommitModel(TreesOneIter&& new_trees); @@ -410,9 +360,7 @@ class GBTree : public GradientBooster { std::vector> updaters_; // Predictors std::unique_ptr cpu_predictor_; -#if defined(XGBOOST_USE_CUDA) - std::unique_ptr gpu_predictor_; -#endif // defined(XGBOOST_USE_CUDA) + std::unique_ptr gpu_predictor_{nullptr}; #if 
defined(XGBOOST_USE_ONEAPI) std::unique_ptr oneapi_predictor_; #endif // defined(XGBOOST_USE_ONEAPI) diff --git a/src/learner.cc b/src/learner.cc index 78297404b..d2f1c774d 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -40,6 +40,7 @@ #include "common/api_entry.h" // for XGBAPIThreadLocalEntry #include "common/charconv.h" // for to_chars, to_chars_result, NumericLimits, from_... #include "common/common.h" // for ToString, Split +#include "common/error_msg.h" // for MaxFeatureSize #include "common/io.h" // for PeekableInStream, ReadAll, FixedSizeStream, Mem... #include "common/observer.h" // for TrainingObserver #include "common/random.h" // for GlobalRandom @@ -763,9 +764,7 @@ class LearnerConfiguration : public Learner { CHECK(matrix.first.ptr); CHECK(!matrix.second.ref.expired()); const uint64_t num_col = matrix.first.ptr->Info().num_col_; - CHECK_LE(num_col, static_cast(std::numeric_limits::max())) - << "Unfortunately, XGBoost does not support data matrices with " - << std::numeric_limits::max() << " features or greater"; + error::MaxFeatureSize(num_col); num_feature = std::max(num_feature, static_cast(num_col)); } @@ -1413,6 +1412,8 @@ class LearnerImpl : public LearnerIO { this->CheckModelInitialized(); auto& out_predictions = this->GetThreadLocal().prediction_entry; + out_predictions.version = 0; + this->gbm_->InplacePredict(p_m, missing, &out_predictions, iteration_begin, iteration_end); if (type == PredictionType::kValue) { obj_->PredTransform(&out_predictions.predictions); diff --git a/src/objective/lambdarank_obj.cu b/src/objective/lambdarank_obj.cu index 110e4ae87..2a7cac751 100644 --- a/src/objective/lambdarank_obj.cu +++ b/src/objective/lambdarank_obj.cu @@ -577,8 +577,8 @@ void LambdaRankUpdatePositionBias(Context const* ctx, linalg::VectorView= Eps64()) { tj_minus(i) = std::pow(lj(i) / lj(0), regularizer); } - assert(!std::isinf(ti_plus(i))); - assert(!std::isinf(tj_minus(i))); + assert(!isinf(ti_plus(i))); + assert(!isinf(tj_minus(i))); }); } } // namespace cuda_impl diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc index 56362a112..b9cb02d56 100644 --- a/src/predictor/cpu_predictor.cc +++ b/src/predictor/cpu_predictor.cc @@ -883,9 +883,8 @@ class CPUPredictor : public Predictor { for (const auto &batch : p_fmat->GetBatches()) { auto page = batch.GetView(); // parallel over local batch - const auto nsize = static_cast(batch.Size()); - common::ParallelFor(nsize, n_threads, [&](bst_omp_uint i) { - auto row_idx = static_cast(batch.base_rowid + i); + common::ParallelFor(batch.Size(), n_threads, [&](auto i) { + auto row_idx = batch.base_rowid + i; RegTree::FVec &feats = feat_vecs[omp_get_thread_num()]; if (feats.Size() == 0) { feats.Init(num_feature); diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 2807dcfd7..9378bde20 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -226,9 +226,7 @@ struct GPUHistMakerDevice { monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id)); } - ~GPUHistMakerDevice() { // NOLINT - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); - } + ~GPUHistMakerDevice() = default; void InitFeatureGroupsOnce() { if (!feature_groups) { diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py index 90a7781c2..dda2746bf 100644 --- a/tests/ci_build/lint_python.py +++ b/tests/ci_build/lint_python.py @@ -25,6 +25,9 @@ class LintersPaths: "tests/python/test_tree_regularization.py", "tests/python/test_shap.py", 
"tests/python-gpu/test_gpu_data_iterator.py", + "tests/python-gpu/test_gpu_prediction.py", + "tests/python-gpu/load_pickle.py", + "tests/python-gpu/test_gpu_pickling.py", "tests/test_distributed/test_with_spark/", "tests/test_distributed/test_gpu_with_spark/", # demo @@ -68,6 +71,7 @@ class LintersPaths: "tests/python/test_dt.py", "tests/python/test_data_iterator.py", "tests/python-gpu/test_gpu_data_iterator.py", + "tests/python-gpu/load_pickle.py", "tests/test_distributed/test_with_spark/test_data.py", "tests/test_distributed/test_gpu_with_spark/test_data.py", "tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py", diff --git a/tests/cpp/common/test_json.cc b/tests/cpp/common/test_json.cc index 3e2038e13..1b4ed76ec 100644 --- a/tests/cpp/common/test_json.cc +++ b/tests/cpp/common/test_json.cc @@ -41,7 +41,6 @@ std::string GetModelStr() { "num_class": "0", "num_feature": "10", "objective": "reg:linear", - "predictor": "gpu_predictor", "tree_method": "gpu_hist", "updater": "grow_gpu_hist" }, diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc index 93d0cf525..2bc0b2c6b 100644 --- a/tests/cpp/gbm/test_gbtree.cc +++ b/tests/cpp/gbm/test_gbtree.cc @@ -1,17 +1,20 @@ -/*! - * Copyright 2019-2022 XGBoost contributors +/** + * Copyright 2019-2023, XGBoost contributors */ #include #include +#include // for HostDeviceVector +#include // for Learner -#include "../../../src/data/adapter.h" -#include "../../../src/data/proxy_dmatrix.h" +#include // for numeric_limits +#include // for shared_ptr +#include // for string + +#include "../../../src/data/proxy_dmatrix.h" // for DMatrixProxy #include "../../../src/gbm/gbtree.h" #include "../filesystem.h" // dmlc::TemporaryDirectory #include "../helpers.h" #include "xgboost/base.h" -#include "xgboost/host_device_vector.h" -#include "xgboost/learner.h" #include "xgboost/predictor.h" namespace xgboost { @@ -113,12 +116,11 @@ TEST(GBTree, WrongUpdater) { #ifdef XGBOOST_USE_CUDA TEST(GBTree, ChoosePredictor) { // The test ensures data don't get pulled into device. - size_t constexpr kRows = 17; - size_t constexpr kCols = 15; + std::size_t constexpr kRows = 17, kCols = 15; auto p_dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(); - auto& data = (*(p_dmat->GetBatches().begin())).data; + auto const& data = (*(p_dmat->GetBatches().begin())).data; p_dmat->Info().labels.Reshape(kRows); auto learner = std::unique_ptr(Learner::Create({p_dmat})); @@ -127,14 +129,13 @@ TEST(GBTree, ChoosePredictor) { learner->UpdateOneIter(i, p_dmat); } ASSERT_TRUE(data.HostCanWrite()); + dmlc::TemporaryDirectory tempdir; const std::string fname = tempdir.path + "/model_param.bst"; - { std::unique_ptr fo(dmlc::Stream::Create(fname.c_str(), "w")); learner->Save(fo.get()); } - // a new learner learner = std::unique_ptr(Learner::Create({p_dmat})); { @@ -146,6 +147,8 @@ TEST(GBTree, ChoosePredictor) { learner->UpdateOneIter(i, p_dmat); } ASSERT_TRUE(data.HostCanWrite()); + ASSERT_FALSE(data.DeviceCanWrite()); + ASSERT_FALSE(data.DeviceCanRead()); // pull data into device. 
data.HostVector(); @@ -232,14 +235,15 @@ TEST(Dart, JsonIO) { namespace { class Dart : public testing::TestWithParam { public: - void Run(std::string predictor) { + void Run(std::string device) { size_t constexpr kRows = 16, kCols = 10; HostDeviceVector data; - auto rng = RandomDataGenerator(kRows, kCols, 0); - if (predictor == "gpu_predictor") { - rng.Device(0); + Context ctx; + if (device == "GPU") { + ctx = MakeCUDACtx(0); } + auto rng = RandomDataGenerator(kRows, kCols, 0).Device(ctx.gpu_id); auto array_str = rng.GenerateArrayInterface(&data); auto p_mat = GetDMatrixFromData(data.HostVector(), kRows, kCols); @@ -258,14 +262,14 @@ class Dart : public testing::TestWithParam { learner->UpdateOneIter(i, p_mat); } - learner->SetParam("predictor", predictor); + ConfigLearnerByCtx(&ctx, learner.get()); HostDeviceVector predts_training; learner->Predict(p_mat, false, &predts_training, 0, 0, true); HostDeviceVector* inplace_predts; std::shared_ptr x{new data::DMatrixProxy{}}; - if (predictor == "gpu_predictor") { + if (ctx.IsCUDA()) { x->SetCUDAArray(array_str.c_str()); } else { x->SetArrayData(array_str.c_str()); @@ -295,10 +299,9 @@ class Dart : public testing::TestWithParam { TEST_P(Dart, Prediction) { this->Run(GetParam()); } #if defined(XGBOOST_USE_CUDA) -INSTANTIATE_TEST_SUITE_P(PredictorTypes, Dart, - testing::Values("auto", "cpu_predictor", "gpu_predictor")); +INSTANTIATE_TEST_SUITE_P(PredictorTypes, Dart, testing::Values("CPU", "GPU")); #else -INSTANTIATE_TEST_SUITE_P(PredictorTypes, Dart, testing::Values("auto", "cpu_predictor")); +INSTANTIATE_TEST_SUITE_P(PredictorTypes, Dart, testing::Values("CPU")); #endif // defined(XGBOOST_USE_CUDA) diff --git a/tests/cpp/gbm/test_gbtree.cu b/tests/cpp/gbm/test_gbtree.cu new file mode 100644 index 000000000..2393bfabd --- /dev/null +++ b/tests/cpp/gbm/test_gbtree.cu @@ -0,0 +1,88 @@ +/** + * Copyright 2023, XGBoost contributors + */ +#include // for Context +#include // for Learner +#include // for StringView + +#include // for numeric_limits +#include // for shared_ptr +#include // for string + +#include "../../../src/data/adapter.h" // for ArrayAdapter +#include "../../../src/data/device_adapter.cuh" // for CupyAdapter +#include "../../../src/data/proxy_dmatrix.h" // for DMatrixProxy +#include "../helpers.h" // for RandomDataGenerator + +namespace xgboost { +void TestInplaceFallback(Context const* ctx) { + // prepare data + bst_row_t n_samples{1024}; + bst_feature_t n_features{32}; + HostDeviceVector X_storage; + // use a different device than the learner + std::int32_t data_ordinal = ctx->IsCPU() ? 
0 : -1; + auto X = RandomDataGenerator{n_samples, n_features, 0.0} + .Device(data_ordinal) + .GenerateArrayInterface(&X_storage); + HostDeviceVector y_storage; + auto y = RandomDataGenerator{n_samples, 1u, 0.0}.GenerateArrayInterface(&y_storage); + + std::shared_ptr Xy; + if (data_ordinal == Context::kCpuId) { + auto X_adapter = data::ArrayAdapter{StringView{X}}; + Xy.reset(DMatrix::Create(&X_adapter, std::numeric_limits::quiet_NaN(), ctx->Threads())); + } else { + auto X_adapter = data::CupyAdapter{StringView{X}}; + Xy.reset(DMatrix::Create(&X_adapter, std::numeric_limits::quiet_NaN(), ctx->Threads())); + } + + Xy->SetInfo("label", y); + + // learner is configured to the device specified by ctx + std::unique_ptr learner{Learner::Create({Xy})}; + ConfigLearnerByCtx(ctx, learner.get()); + for (std::int32_t i = 0; i < 3; ++i) { + learner->UpdateOneIter(i, Xy); + } + + std::shared_ptr p_m{new data::DMatrixProxy}; + auto proxy = std::dynamic_pointer_cast(p_m); + if (data_ordinal == Context::kCpuId) { + proxy->SetArrayData(StringView{X}); + } else { + proxy->SetCUDAArray(X.c_str()); + } + + HostDeviceVector* out_predt{nullptr}; + ConsoleLogger::Configure(Args{{"verbosity", "1"}}); + // test whether the warning is raised + ::testing::internal::CaptureStderr(); + learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits::quiet_NaN(), + &out_predt, 0, 0); + auto output = testing::internal::GetCapturedStderr(); + std::cout << "output:" << output << std::endl; + ASSERT_NE(output.find("Falling back"), std::string::npos); + + // test when the contexts match + Context new_ctx = *proxy->Ctx(); + ASSERT_NE(new_ctx.gpu_id, ctx->gpu_id); + + ConfigLearnerByCtx(&new_ctx, learner.get()); + HostDeviceVector* out_predt_1{nullptr}; + // no warning is raised + ::testing::internal::CaptureStderr(); + learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits::quiet_NaN(), + &out_predt_1, 0, 0); + output = testing::internal::GetCapturedStderr(); + + ASSERT_TRUE(output.empty()); + + ASSERT_EQ(out_predt->ConstHostVector(), out_predt_1->ConstHostVector()); +} + +TEST(GBTree, InplacePredictFallback) { + auto ctx = MakeCUDACtx(0); + TestInplaceFallback(&ctx); +} +} // namespace xgboost diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 7c81b96f9..49ff5e412 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -395,6 +395,9 @@ std::shared_ptr RandomDataGenerator::GenerateDMatrix(bool with_label, b for (auto const& page : out->GetBatches()) { page.data.SetDevice(device_); page.offset.SetDevice(device_); + // pull to device + page.data.ConstDeviceSpan(); + page.offset.ConstDeviceSpan(); } } if (!ft_.empty()) { diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index 004a64ce4..035baf22a 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -183,7 +183,7 @@ class SimpleRealUniformDistribution { for (size_t k = m; k != 0; --k) { sum_value += static_cast((*rng)() - rng->Min()) * r_k; - r_k *= r; + r_k *= static_cast(r); } ResultT res = sum_value / r_k; @@ -322,15 +322,14 @@ inline std::shared_ptr EmptyDMatrix() { return RandomDataGenerator{0, 0, 0.0}.GenerateDMatrix(); } -inline std::vector -GenerateRandomCategoricalSingleColumn(int n, size_t num_categories) { +inline std::vector GenerateRandomCategoricalSingleColumn(int n, size_t num_categories) { std::vector x(n); std::mt19937 rng(0); std::uniform_int_distribution dist(0, num_categories - 1); std::generate(x.begin(), x.end(), [&]() { return dist(rng); }); // Make sure each category is present - for(size_t i = 
0; i < num_categories; i++) { - x[i] = i; + for (size_t i = 0; i < num_categories; i++) { + x[i] = static_cast(i); } return x; } @@ -549,4 +548,15 @@ class DeclareUnifiedDistributedTest(MetricTest) : public ::testing::Test { } }; +// A temporary solution before we move away from gpu_id. +inline void ConfigLearnerByCtx(Context const* ctx, Learner* learner) { + if (ctx->IsCPU()) { + learner->SetParam("tree_method", "hist"); + } else { + learner->SetParam("tree_method", "gpu_hist"); + } + learner->SetParam("gpu_id", std::to_string(ctx->gpu_id)); + learner->Configure(); + ASSERT_EQ(learner->Ctx()->gpu_id, ctx->gpu_id); +} } // namespace xgboost diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc index 7f1803414..087543cfe 100644 --- a/tests/cpp/predictor/test_cpu_predictor.cc +++ b/tests/cpp/predictor/test_cpu_predictor.cc @@ -122,11 +122,13 @@ TEST(CpuPredictor, BasicColumnSplit) { } TEST(CpuPredictor, IterationRange) { - TestIterationRange("cpu_predictor"); + Context ctx; + TestIterationRange(&ctx); } TEST(CpuPredictor, IterationRangeColmnSplit) { - TestIterationRangeColumnSplit("cpu_predictor"); + Context ctx; + TestIterationRangeColumnSplit(&ctx); } TEST(CpuPredictor, ExternalMemory) { @@ -139,7 +141,8 @@ TEST(CpuPredictor, ExternalMemory) { TEST(CpuPredictor, InplacePredict) { bst_row_t constexpr kRows{128}; bst_feature_t constexpr kCols{64}; - auto gen = RandomDataGenerator{kRows, kCols, 0.5}.Device(-1); + Context ctx; + auto gen = RandomDataGenerator{kRows, kCols, 0.5}.Device(ctx.gpu_id); { HostDeviceVector data; gen.GenerateDense(&data); @@ -149,7 +152,7 @@ TEST(CpuPredictor, InplacePredict) { std::string arr_str; Json::Dump(array_interface, &arr_str); x->SetArrayData(arr_str.data()); - TestInplacePrediction(x, "cpu_predictor", kRows, kCols, Context::kCpuId); + TestInplacePrediction(&ctx, x, kRows, kCols); } { @@ -166,50 +169,50 @@ TEST(CpuPredictor, InplacePredict) { Json::Dump(col_interface, &col_str); std::shared_ptr x{new data::DMatrixProxy}; x->SetCSRData(rptr_str.data(), col_str.data(), data_str.data(), kCols, true); - TestInplacePrediction(x, "cpu_predictor", kRows, kCols, Context::kCpuId); + TestInplacePrediction(&ctx, x, kRows, kCols); } } +namespace { void TestUpdatePredictionCache(bool use_subsampling) { - size_t constexpr kRows = 64, kCols = 16, kClasses = 4; + std::size_t constexpr kRows = 64, kCols = 16, kClasses = 4; LearnerModelParam mparam{MakeMP(kCols, .0, kClasses)}; Context ctx; std::unique_ptr gbm; gbm.reset(static_cast(GradientBooster::Create("gbtree", &ctx, &mparam))); - std::map cfg; - cfg["tree_method"] = "hist"; - cfg["predictor"] = "cpu_predictor"; + Args args{{"tree_method", "hist"}}; if (use_subsampling) { - cfg["subsample"] = "0.5"; + args.emplace_back("subsample", "0.5"); } - Args args = {cfg.cbegin(), cfg.cend()}; gbm->Configure(args); auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(true, true, kClasses); HostDeviceVector gpair; auto& h_gpair = gpair.HostVector(); - h_gpair.resize(kRows*kClasses); - for (size_t i = 0; i < kRows*kClasses; ++i) { + h_gpair.resize(kRows * kClasses); + for (size_t i = 0; i < kRows * kClasses; ++i) { h_gpair[i] = {static_cast(i), 1}; } PredictionCacheEntry predtion_cache; - predtion_cache.predictions.Resize(kRows*kClasses, 0); - // after one training iteration predtion_cache is filled with cached in QuantileHistMaker::Builder prediction values + predtion_cache.predictions.Resize(kRows * kClasses, 0); + // after one training iteration predtion_cache is filled with 
cached in QuantileHistMaker + // prediction values gbm->DoBoost(dmat.get(), &gpair, &predtion_cache, nullptr); PredictionCacheEntry out_predictions; - // perform fair prediction on the same input data, should be equal to cached result + // perform prediction from scratch on the same input data, should be equal to cached result gbm->PredictBatch(dmat.get(), &out_predictions, false, 0, 0); - std::vector &out_predictions_h = out_predictions.predictions.HostVector(); - std::vector &predtion_cache_from_train = predtion_cache.predictions.HostVector(); + std::vector& out_predictions_h = out_predictions.predictions.HostVector(); + std::vector& predtion_cache_from_train = predtion_cache.predictions.HostVector(); for (size_t i = 0; i < out_predictions_h.size(); ++i) { ASSERT_NEAR(out_predictions_h[i], predtion_cache_from_train[i], kRtEps); } } +} // namespace TEST(CPUPredictor, GHistIndex) { size_t constexpr kRows{128}, kCols{16}, kBins{64}; @@ -223,19 +226,23 @@ TEST(CPUPredictor, GHistIndex) { } TEST(CPUPredictor, CategoricalPrediction) { - TestCategoricalPrediction("cpu_predictor"); + Context ctx; + TestCategoricalPrediction(&ctx, false); } TEST(CPUPredictor, CategoricalPredictionColumnSplit) { - TestCategoricalPredictionColumnSplit("cpu_predictor"); + Context ctx; + TestCategoricalPredictionColumnSplit(&ctx); } TEST(CPUPredictor, CategoricalPredictLeaf) { - TestCategoricalPredictLeaf(StringView{"cpu_predictor"}); + Context ctx; + TestCategoricalPredictLeaf(&ctx, false); } TEST(CPUPredictor, CategoricalPredictLeafColumnSplit) { - TestCategoricalPredictLeafColumnSplit(StringView{"cpu_predictor"}); + Context ctx; + TestCategoricalPredictLeafColumnSplit(&ctx); } TEST(CpuPredictor, UpdatePredictionCache) { @@ -244,21 +251,25 @@ TEST(CpuPredictor, UpdatePredictionCache) { } TEST(CpuPredictor, LesserFeatures) { - TestPredictionWithLesserFeatures("cpu_predictor"); + Context ctx; + TestPredictionWithLesserFeatures(&ctx); } TEST(CpuPredictor, LesserFeaturesColumnSplit) { - TestPredictionWithLesserFeaturesColumnSplit("cpu_predictor"); + Context ctx; + TestPredictionWithLesserFeaturesColumnSplit(&ctx); } TEST(CpuPredictor, Sparse) { - TestSparsePrediction(0.2, "cpu_predictor"); - TestSparsePrediction(0.8, "cpu_predictor"); + Context ctx; + TestSparsePrediction(&ctx, 0.2); + TestSparsePrediction(&ctx, 0.8); } TEST(CpuPredictor, SparseColumnSplit) { - TestSparsePredictionColumnSplit(0.2, "cpu_predictor"); - TestSparsePredictionColumnSplit(0.8, "cpu_predictor"); + Context ctx; + TestSparsePredictionColumnSplit(&ctx, 0.2); + TestSparsePredictionColumnSplit(&ctx, 0.8); } TEST(CpuPredictor, Multi) { @@ -266,4 +277,6 @@ TEST(CpuPredictor, Multi) { ctx.nthread = 1; TestVectorLeafPrediction(&ctx); } + +TEST(CpuPredictor, Access) { TestPredictionDeviceAccess(); } } // namespace xgboost diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu index 6911824a9..30fbaf997 100644 --- a/tests/cpp/predictor/test_gpu_predictor.cu +++ b/tests/cpp/predictor/test_gpu_predictor.cu @@ -15,8 +15,7 @@ #include "../helpers.h" #include "test_predictor.h" -namespace xgboost { -namespace predictor { +namespace xgboost::predictor { TEST(GPUPredictor, Basic) { auto cpu_lparam = MakeCUDACtx(-1); @@ -120,13 +119,14 @@ TEST(GPUPredictor, MGPUBasicColumnSplit) { } TEST(GPUPredictor, EllpackBasic) { - size_t constexpr kCols {8}; + size_t constexpr kCols{8}; + auto ctx = MakeCUDACtx(0); for (size_t bins = 2; bins < 258; bins += 16) { size_t rows = bins * 16; auto p_m = RandomDataGenerator{rows, kCols, 
0.0}.Bins(bins).Device(0).GenerateDeviceDMatrix(); ASSERT_FALSE(p_m->PageExists()); - TestPredictionFromGradientIndex("gpu_predictor", rows, kCols, p_m); - TestPredictionFromGradientIndex("gpu_predictor", bins, kCols, p_m); + TestPredictionFromGradientIndex(&ctx, rows, kCols, p_m); + TestPredictionFromGradientIndex(&ctx, bins, kCols, p_m); } } @@ -181,29 +181,32 @@ TEST(GPUPredictor, ExternalMemoryTest) { } TEST(GPUPredictor, InplacePredictCupy) { + auto ctx = MakeCUDACtx(0); size_t constexpr kRows{128}, kCols{64}; RandomDataGenerator gen(kRows, kCols, 0.5); - gen.Device(0); + gen.Device(ctx.gpu_id); HostDeviceVector data; std::string interface_str = gen.GenerateArrayInterface(&data); std::shared_ptr p_fmat{new data::DMatrixProxy}; dynamic_cast(p_fmat.get())->SetCUDAArray(interface_str.c_str()); - TestInplacePrediction(p_fmat, "gpu_predictor", kRows, kCols, 0); + TestInplacePrediction(&ctx, p_fmat, kRows, kCols); } TEST(GPUPredictor, InplacePredictCuDF) { + auto ctx = MakeCUDACtx(0); size_t constexpr kRows{128}, kCols{64}; RandomDataGenerator gen(kRows, kCols, 0.5); - gen.Device(0); + gen.Device(ctx.gpu_id); std::vector> storage(kCols); auto interface_str = gen.GenerateColumnarArrayInterface(&storage); std::shared_ptr p_fmat{new data::DMatrixProxy}; dynamic_cast(p_fmat.get())->SetCUDAArray(interface_str.c_str()); - TestInplacePrediction(p_fmat, "gpu_predictor", kRows, kCols, 0); + TestInplacePrediction(&ctx, p_fmat, kRows, kCols); } TEST(GpuPredictor, LesserFeatures) { - TestPredictionWithLesserFeatures("gpu_predictor"); + auto ctx = MakeCUDACtx(0); + TestPredictionWithLesserFeatures(&ctx); } // Very basic test of empty model @@ -268,15 +271,18 @@ TEST(GPUPredictor, Shap) { } TEST(GPUPredictor, IterationRange) { - TestIterationRange("gpu_predictor"); + auto ctx = MakeCUDACtx(0); + TestIterationRange(&ctx); } TEST(GPUPredictor, CategoricalPrediction) { - TestCategoricalPrediction("gpu_predictor"); + auto ctx = MakeCUDACtx(0); + TestCategoricalPrediction(&ctx, false); } TEST(GPUPredictor, CategoricalPredictLeaf) { - TestCategoricalPredictLeaf(StringView{"gpu_predictor"}); + auto ctx = MakeCUDACtx(0); + TestCategoricalPredictLeaf(&ctx, false); } TEST(GPUPredictor, PredictLeafBasic) { @@ -300,8 +306,8 @@ TEST(GPUPredictor, PredictLeafBasic) { } TEST(GPUPredictor, Sparse) { - TestSparsePrediction(0.2, "gpu_predictor"); - TestSparsePrediction(0.8, "gpu_predictor"); + auto ctx = MakeCUDACtx(0); + TestSparsePrediction(&ctx, 0.2); + TestSparsePrediction(&ctx, 0.8); } -} // namespace predictor -} // namespace xgboost +} // namespace xgboost::predictor diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc index fb7e7fb8a..b85abf183 100644 --- a/tests/cpp/predictor/test_predictor.cc +++ b/tests/cpp/predictor/test_predictor.cc @@ -8,9 +8,11 @@ #include // for DMatrix, BatchIterator, BatchSet, MetaInfo #include // for HostDeviceVector #include // for PredictionCacheEntry, Predictor, Predic... 
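As a cross-check from the Python side, the in-place prediction path the GPU predictor tests above exercise can be sketched with the public API (xgb.train, Booster.inplace_predict). This is an illustration rather than part of the patch, and assumes a CUDA build with device 0 available:

    import cupy as cp
    import xgboost as xgb

    # Generate data directly on the GPU, mirroring RandomDataGenerator{...}.Device(0).
    X = cp.random.rand(128, 64, dtype=cp.float32)
    y = cp.random.rand(128, dtype=cp.float32)
    dtrain = xgb.DMatrix(X, y)
    booster = xgb.train({"tree_method": "gpu_hist", "gpu_id": 0}, dtrain, num_boost_round=4)

    # In-place prediction reads the cupy buffer directly; it should match the
    # DMatrix-based prediction path.
    inplace = booster.inplace_predict(X)
    copied = booster.predict(xgb.DMatrix(X))
    cp.testing.assert_allclose(inplace, cp.asarray(copied), atol=1e-6)
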
+#include // for StringView #include // for max #include // for numeric_limits +#include // for shared_ptr #include // for unordered_map #include "../../../src/common/bitfield.h" // for LBitField32 @@ -51,7 +53,7 @@ void TestTrainingPrediction(size_t rows, size_t bins, size_t constexpr kIters = 3; std::unique_ptr learner; - auto train = [&](std::string predictor) { + auto train = [&](Context const& ctx) { p_hist->Info().labels.Reshape(rows, 1); auto &h_label = p_hist->Info().labels.Data()->HostVector(); @@ -65,7 +67,7 @@ void TestTrainingPrediction(size_t rows, size_t bins, learner->SetParam("num_feature", std::to_string(kCols)); learner->SetParam("num_class", std::to_string(kClasses)); learner->SetParam("max_bin", std::to_string(bins)); - learner->SetParam("predictor", predictor); + ConfigLearnerByCtx(&ctx, learner.get()); learner->Configure(); for (size_t i = 0; i < kIters; ++i) { @@ -77,7 +79,7 @@ void TestTrainingPrediction(size_t rows, size_t bins, learner.reset(Learner::Create({})); learner->LoadModel(model); - learner->SetParam("predictor", predictor); + ConfigLearnerByCtx(&ctx, learner.get()); learner->Configure(); HostDeviceVector from_full; @@ -93,16 +95,16 @@ void TestTrainingPrediction(size_t rows, size_t bins, }; if (tree_method == "gpu_hist") { - train("gpu_predictor"); + train(MakeCUDACtx(0)); } else { - train("cpu_predictor"); + train(Context{}); } } -void TestInplacePrediction(std::shared_ptr x, std::string predictor, bst_row_t rows, - bst_feature_t cols, int32_t device) { - size_t constexpr kClasses { 4 }; - auto gen = RandomDataGenerator{rows, cols, 0.5}.Device(device); +void TestInplacePrediction(Context const *ctx, std::shared_ptr x, bst_row_t rows, + bst_feature_t cols) { + std::size_t constexpr kClasses { 4 }; + auto gen = RandomDataGenerator{rows, cols, 0.5}.Device(ctx->gpu_id); std::shared_ptr m = gen.GenerateDMatrix(true, false, kClasses); std::unique_ptr learner { @@ -113,12 +115,14 @@ void TestInplacePrediction(std::shared_ptr x, std::string predictor, bs learner->SetParam("num_class", std::to_string(kClasses)); learner->SetParam("seed", "0"); learner->SetParam("subsample", "0.5"); - learner->SetParam("gpu_id", std::to_string(device)); - learner->SetParam("predictor", predictor); + learner->SetParam("tree_method", "hist"); for (int32_t it = 0; it < 4; ++it) { learner->UpdateOneIter(it, m); } + learner->SetParam("gpu_id", std::to_string(ctx->gpu_id)); + learner->Configure(); + HostDeviceVector *p_out_predictions_0{nullptr}; learner->InplacePredict(x, PredictionType::kMargin, std::numeric_limits::quiet_NaN(), &p_out_predictions_0, 0, 2); @@ -154,40 +158,79 @@ void TestInplacePrediction(std::shared_ptr x, std::string predictor, bs } namespace { -std::unique_ptr LearnerForTest(std::shared_ptr dmat, size_t iters, - size_t forest = 1) { +std::unique_ptr LearnerForTest(Context const *ctx, std::shared_ptr dmat, + size_t iters, size_t forest = 1) { std::unique_ptr learner{Learner::Create({dmat})}; learner->SetParams(Args{{"num_parallel_tree", std::to_string(forest)}}); for (size_t i = 0; i < iters; ++i) { learner->UpdateOneIter(i, dmat); } + + ConfigLearnerByCtx(ctx, learner.get()); return learner; } -void VerifyPredictionWithLesserFeatures(Learner *learner, std::string const &predictor_name, - size_t rows, std::shared_ptr const &m_test, - std::shared_ptr const &m_invalid) { +void VerifyPredictionWithLesserFeatures(Learner *learner, bst_row_t kRows, + std::shared_ptr m_test, + std::shared_ptr m_invalid) { HostDeviceVector prediction; - learner->SetParam("predictor", 
predictor_name); - learner->Configure(); Json config{Object()}; learner->SaveConfig(&config); - ASSERT_EQ(get(config["learner"]["gradient_booster"]["gbtree_train_param"]["predictor"]), - predictor_name); learner->Predict(m_test, false, &prediction, 0, 0); - ASSERT_EQ(prediction.Size(), rows); + ASSERT_EQ(prediction.Size(), kRows); ASSERT_THROW({ learner->Predict(m_invalid, false, &prediction, 0, 0); }, dmlc::Error); +} + +void VerifyPredictionWithLesserFeaturesColumnSplit(Learner *learner, size_t rows, + std::shared_ptr m_test, + std::shared_ptr m_invalid) { + auto const world_size = collective::GetWorldSize(); + auto const rank = collective::GetRank(); + std::shared_ptr sliced_test{m_test->SliceCol(world_size, rank)}; + std::shared_ptr sliced_invalid{m_invalid->SliceCol(world_size, rank)}; + + VerifyPredictionWithLesserFeatures(learner, rows, sliced_test, sliced_invalid); +} +} // anonymous namespace + +void TestPredictionWithLesserFeatures(Context const *ctx) { + size_t constexpr kRows = 256, kTrainCols = 256, kTestCols = 4, kIters = 4; + auto m_train = RandomDataGenerator(kRows, kTrainCols, 0.5).GenerateDMatrix(true); + auto learner = LearnerForTest(ctx, m_train, kIters); + auto m_test = RandomDataGenerator(kRows, kTestCols, 0.5).GenerateDMatrix(false); + auto m_invalid = RandomDataGenerator(kRows, kTrainCols + 1, 0.5).GenerateDMatrix(false); + VerifyPredictionWithLesserFeatures(learner.get(), kRows, m_test, m_invalid); +} + +void TestPredictionDeviceAccess() { + Context ctx; + size_t constexpr kRows = 256, kTrainCols = 256, kTestCols = 4, kIters = 4; + auto m_train = RandomDataGenerator(kRows, kTrainCols, 0.5).GenerateDMatrix(true); + auto m_test = RandomDataGenerator(kRows, kTestCols, 0.5).GenerateDMatrix(false); + auto learner = LearnerForTest(&ctx, m_train, kIters); + + HostDeviceVector from_cpu; + { + ASSERT_EQ(from_cpu.DeviceIdx(), Context::kCpuId); + Context cpu_ctx; + ConfigLearnerByCtx(&cpu_ctx, learner.get()); + learner->Predict(m_test, false, &from_cpu, 0, 0); + ASSERT_TRUE(from_cpu.HostCanWrite()); + ASSERT_FALSE(from_cpu.DeviceCanRead()); + } #if defined(XGBOOST_USE_CUDA) - HostDeviceVector from_cpu; - learner->SetParam("predictor", "cpu_predictor"); - learner->Predict(m_test, false, &from_cpu, 0, 0); - HostDeviceVector from_cuda; - learner->SetParam("predictor", "gpu_predictor"); - learner->Predict(m_test, false, &from_cuda, 0, 0); + { + Context cuda_ctx = MakeCUDACtx(0); + ConfigLearnerByCtx(&cuda_ctx, learner.get()); + learner->Predict(m_test, false, &from_cuda, 0, 0); + ASSERT_EQ(from_cuda.DeviceIdx(), 0); + ASSERT_TRUE(from_cuda.DeviceCanWrite()); + ASSERT_FALSE(from_cuda.HostCanRead()); + } auto const &h_cpu = from_cpu.ConstHostVector(); auto const &h_gpu = from_cuda.ConstHostVector(); @@ -196,41 +239,17 @@ void VerifyPredictionWithLesserFeatures(Learner *learner, std::string const &pre } #endif // defined(XGBOOST_USE_CUDA) } -} // anonymous namespace -void TestPredictionWithLesserFeatures(std::string predictor_name) { +void TestPredictionWithLesserFeaturesColumnSplit(Context const *ctx) { size_t constexpr kRows = 256, kTrainCols = 256, kTestCols = 4, kIters = 4; auto m_train = RandomDataGenerator(kRows, kTrainCols, 0.5).GenerateDMatrix(true); - auto learner = LearnerForTest(m_train, kIters); - auto m_test = RandomDataGenerator(kRows, kTestCols, 0.5).GenerateDMatrix(false); - auto m_invalid = RandomDataGenerator(kRows, kTrainCols + 1, 0.5).GenerateDMatrix(false); - VerifyPredictionWithLesserFeatures(learner.get(), predictor_name, kRows, m_test, m_invalid); -} - 
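The device-access assertions above (HostCanWrite / DeviceCanRead) have no direct Python equivalent, but the CPU-versus-GPU agreement they rely on can be sketched with public parameters. A minimal sketch, assuming a CUDA build; with this patch series the prediction device follows gpu_id rather than the removed `predictor` parameter:

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(256, 4).astype(np.float32)
    y = np.random.rand(256).astype(np.float32)
    booster = xgb.train({"tree_method": "hist"}, xgb.DMatrix(X, y), num_boost_round=4)

    # Predict on the CPU first, then point the booster at GPU 0 and repeat;
    # the two prediction vectors should agree to floating-point tolerance.
    from_cpu = booster.predict(xgb.DMatrix(X))
    booster.set_param({"tree_method": "gpu_hist", "gpu_id": "0"})
    from_cuda = booster.predict(xgb.DMatrix(X))
    np.testing.assert_allclose(from_cpu, from_cuda, rtol=1e-5)
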
-namespace { -void VerifyPredictionWithLesserFeaturesColumnSplit(Learner *learner, - std::string const &predictor_name, size_t rows, - std::shared_ptr m_test, - std::shared_ptr m_invalid) { - auto const world_size = collective::GetWorldSize(); - auto const rank = collective::GetRank(); - std::shared_ptr sliced_test{m_test->SliceCol(world_size, rank)}; - std::shared_ptr sliced_invalid{m_invalid->SliceCol(world_size, rank)}; - - VerifyPredictionWithLesserFeatures(learner, predictor_name, rows, sliced_test, sliced_invalid); -} -} // anonymous namespace - -void TestPredictionWithLesserFeaturesColumnSplit(std::string predictor_name) { - size_t constexpr kRows = 256, kTrainCols = 256, kTestCols = 4, kIters = 4; - auto m_train = RandomDataGenerator(kRows, kTrainCols, 0.5).GenerateDMatrix(true); - auto learner = LearnerForTest(m_train, kIters); + auto learner = LearnerForTest(ctx, m_train, kIters); auto m_test = RandomDataGenerator(kRows, kTestCols, 0.5).GenerateDMatrix(false); auto m_invalid = RandomDataGenerator(kRows, kTrainCols + 1, 0.5).GenerateDMatrix(false); auto constexpr kWorldSize = 2; RunWithInMemoryCommunicator(kWorldSize, VerifyPredictionWithLesserFeaturesColumnSplit, - learner.get(), predictor_name, kRows, m_test, m_invalid); + learner.get(), kRows, m_test, m_invalid); } void GBTreeModelForTest(gbm::GBTreeModel *model, uint32_t split_ind, @@ -252,7 +271,7 @@ void GBTreeModelForTest(gbm::GBTreeModel *model, uint32_t split_ind, model->CommitModelGroup(std::move(trees), 0); } -void TestCategoricalPrediction(std::string name, bool is_column_split) { +void TestCategoricalPrediction(Context const* ctx, bool is_column_split) { size_t constexpr kCols = 10; PredictionCacheEntry out_predictions; @@ -262,13 +281,10 @@ void TestCategoricalPrediction(std::string name, bool is_column_split) { float left_weight = 1.3f; float right_weight = 1.7f; - Context ctx; - ctx.UpdateAllowUnknown(Args{}); - gbm::GBTreeModel model(&mparam, &ctx); + gbm::GBTreeModel model(&mparam, ctx); GBTreeModelForTest(&model, split_ind, split_cat, left_weight, right_weight); - ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}}); - std::unique_ptr predictor{Predictor::Create(name.c_str(), &ctx)}; + std::unique_ptr predictor{CreatePredictorForTest(ctx)}; std::vector row(kCols); row[split_ind] = split_cat; @@ -298,12 +314,12 @@ void TestCategoricalPrediction(std::string name, bool is_column_split) { ASSERT_EQ(out_predictions.predictions.HostVector()[0], left_weight + score); } -void TestCategoricalPredictionColumnSplit(std::string name) { +void TestCategoricalPredictionColumnSplit(Context const *ctx) { auto constexpr kWorldSize = 2; - RunWithInMemoryCommunicator(kWorldSize, TestCategoricalPrediction, name, true); + RunWithInMemoryCommunicator(kWorldSize, TestCategoricalPrediction, ctx, true); } -void TestCategoricalPredictLeaf(StringView name, bool is_column_split) { +void TestCategoricalPredictLeaf(Context const *ctx, bool is_column_split) { size_t constexpr kCols = 10; PredictionCacheEntry out_predictions; @@ -314,14 +330,10 @@ void TestCategoricalPredictLeaf(StringView name, bool is_column_split) { float left_weight = 1.3f; float right_weight = 1.7f; - Context ctx; - ctx.UpdateAllowUnknown(Args{}); - - gbm::GBTreeModel model(&mparam, &ctx); + gbm::GBTreeModel model(&mparam, ctx); GBTreeModelForTest(&model, split_ind, split_cat, left_weight, right_weight); - ctx.gpu_id = 0; - std::unique_ptr predictor{Predictor::Create(name.c_str(), &ctx)}; + std::unique_ptr predictor{CreatePredictorForTest(ctx)}; std::vector row(kCols); 
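  // The row's categorical value is set to the split category below, so the
  // prediction exercises the matching branch of the single-split model that
  // GBTreeModelForTest builds; the assertions that follow check the
  // corresponding output.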
row[split_ind] = split_cat; @@ -346,19 +358,21 @@ void TestCategoricalPredictLeaf(StringView name, bool is_column_split) { ASSERT_EQ(out_predictions.predictions.HostVector()[0], 1); } -void TestCategoricalPredictLeafColumnSplit(StringView name) { +void TestCategoricalPredictLeafColumnSplit(Context const *ctx) { auto constexpr kWorldSize = 2; - RunWithInMemoryCommunicator(kWorldSize, TestCategoricalPredictLeaf, name, true); + RunWithInMemoryCommunicator(kWorldSize, TestCategoricalPredictLeaf, ctx, true); } -void TestIterationRange(std::string name) { +void TestIterationRange(Context const* ctx) { size_t constexpr kRows = 1000, kCols = 20, kClasses = 4, kForest = 3, kIters = 10; - auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(true, true, kClasses); - auto learner = LearnerForTest(dmat, kIters, kForest); - learner->SetParams(Args{{"predictor", name}}); + auto dmat = RandomDataGenerator(kRows, kCols, 0) + .Device(ctx->gpu_id) + .GenerateDMatrix(true, true, kClasses); + auto learner = LearnerForTest(ctx, dmat, kIters, kForest); bool bound = false; - std::unique_ptr sliced {learner->Slice(0, 3, 1, &bound)}; + bst_layer_t lend{3}; + std::unique_ptr sliced{learner->Slice(0, lend, 1, &bound)}; ASSERT_FALSE(bound); HostDeviceVector out_predt_sliced; @@ -366,11 +380,8 @@ void TestIterationRange(std::string name) { // margin { - sliced->Predict(dmat, true, &out_predt_sliced, 0, 0, false, false, false, - false, false); - - learner->Predict(dmat, true, &out_predt_ranged, 0, 3, false, false, false, - false, false); + sliced->Predict(dmat, true, &out_predt_sliced, 0, 0, false, false, false, false, false); + learner->Predict(dmat, true, &out_predt_ranged, 0, lend, false, false, false, false, false); auto const &h_sliced = out_predt_sliced.HostVector(); auto const &h_range = out_predt_ranged.HostVector(); @@ -380,11 +391,8 @@ void TestIterationRange(std::string name) { // SHAP { - sliced->Predict(dmat, false, &out_predt_sliced, 0, 0, false, false, - true, false, false); - - learner->Predict(dmat, false, &out_predt_ranged, 0, 3, false, false, true, - false, false); + sliced->Predict(dmat, false, &out_predt_sliced, 0, 0, false, false, true, false, false); + learner->Predict(dmat, false, &out_predt_ranged, 0, lend, false, false, true, false, false); auto const &h_sliced = out_predt_sliced.HostVector(); auto const &h_range = out_predt_ranged.HostVector(); @@ -394,10 +402,8 @@ void TestIterationRange(std::string name) { // SHAP interaction { - sliced->Predict(dmat, false, &out_predt_sliced, 0, 0, false, false, - false, false, true); - learner->Predict(dmat, false, &out_predt_ranged, 0, 3, false, false, false, - false, true); + sliced->Predict(dmat, false, &out_predt_sliced, 0, 0, false, false, false, false, true); + learner->Predict(dmat, false, &out_predt_ranged, 0, lend, false, false, false, false, true); auto const &h_sliced = out_predt_sliced.HostVector(); auto const &h_range = out_predt_ranged.HostVector(); ASSERT_EQ(h_sliced.size(), h_range.size()); @@ -406,10 +412,8 @@ void TestIterationRange(std::string name) { // Leaf { - sliced->Predict(dmat, false, &out_predt_sliced, 0, 0, false, true, - false, false, false); - learner->Predict(dmat, false, &out_predt_ranged, 0, 3, false, true, false, - false, false); + sliced->Predict(dmat, false, &out_predt_sliced, 0, 0, false, true, false, false, false); + learner->Predict(dmat, false, &out_predt_ranged, 0, lend, false, true, false, false, false); auto const &h_sliced = out_predt_sliced.HostVector(); auto const &h_range = 
out_predt_ranged.HostVector(); ASSERT_EQ(h_sliced.size(), h_range.size()); @@ -456,11 +460,16 @@ void VerifyIterationRangeColumnSplit(DMatrix *dmat, Learner *learner, Learner *s } } // anonymous namespace -void TestIterationRangeColumnSplit(std::string name) { +void TestIterationRangeColumnSplit(Context const* ctx) { size_t constexpr kRows = 1000, kCols = 20, kClasses = 4, kForest = 3, kIters = 10; auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(true, true, kClasses); - auto learner = LearnerForTest(dmat, kIters, kForest); - learner->SetParams(Args{{"predictor", name}}); + auto learner = LearnerForTest(ctx, dmat, kIters, kForest); + + if (ctx->IsCPU()) { + learner->SetParams(Args{{"gpu_id", std::to_string(-1)}}); + } else { + learner->SetParams(Args{{"gpu_id", std::to_string(0)}}); + } bool bound = false; std::unique_ptr sliced{learner->Slice(0, 3, 1, &bound)}; @@ -488,10 +497,10 @@ void TestIterationRangeColumnSplit(std::string name) { leaf_ranged, leaf_sliced); } -void TestSparsePrediction(float sparsity, std::string predictor) { +void TestSparsePrediction(Context const *ctx, float sparsity) { size_t constexpr kRows = 512, kCols = 128, kIters = 4; auto Xy = RandomDataGenerator(kRows, kCols, sparsity).GenerateDMatrix(true); - auto learner = LearnerForTest(Xy, kIters); + auto learner = LearnerForTest(ctx, Xy, kIters); HostDeviceVector sparse_predt; @@ -501,11 +510,14 @@ void TestSparsePrediction(float sparsity, std::string predictor) { learner.reset(Learner::Create({Xy})); learner->LoadModel(model); - learner->SetParam("predictor", predictor); + if (ctx->IsCUDA()) { + learner->SetParam("tree_method", "gpu_hist"); + learner->SetParam("gpu_id", std::to_string(ctx->gpu_id)); + } learner->Predict(Xy, false, &sparse_predt, 0, 0); HostDeviceVector with_nan(kRows * kCols, std::numeric_limits::quiet_NaN()); - auto& h_with_nan = with_nan.HostVector(); + auto &h_with_nan = with_nan.HostVector(); for (auto const &page : Xy->GetBatches()) { auto batch = page.GetView(); for (size_t i = 0; i < batch.Size(); ++i) { @@ -516,7 +528,8 @@ void TestSparsePrediction(float sparsity, std::string predictor) { } } - learner->SetParam("predictor", "cpu_predictor"); + learner->SetParam("tree_method", "hist"); + learner->SetParam("gpu_id", "-1"); // Xcode_12.4 doesn't compile with `std::make_shared`. 
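The sliced-versus-ranged equivalence asserted in the iteration-range blocks above is also observable from Python, where booster slicing and the iteration_range argument should agree. A small sketch under the assumption of a multi-class booster trained for several rounds:

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(1000, 20).astype(np.float32)
    y = np.random.randint(0, 4, size=1000)
    dtrain = xgb.DMatrix(X, y)
    params = {"objective": "multi:softprob", "num_class": 4, "tree_method": "hist"}
    booster = xgb.train(params, dtrain, num_boost_round=10)

    # booster[0:3] keeps layers [0, 3), matching Learner::Slice(0, 3, 1, ...).
    sliced = booster[0:3]
    a = sliced.predict(dtrain, output_margin=True)
    b = booster.predict(dtrain, output_margin=True, iteration_range=(0, 3))
    np.testing.assert_allclose(a, b)
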
auto dense = std::shared_ptr(new data::DMatrixProxy{}); auto array_interface = GetArrayInterface(&with_nan, kRows, kCols); @@ -527,8 +540,8 @@ void TestSparsePrediction(float sparsity, std::string predictor) { learner->InplacePredict(dense, PredictionType::kValue, std::numeric_limits::quiet_NaN(), &p_dense_predt, 0, 0); - auto const& dense_predt = *p_dense_predt; - if (predictor == "cpu_predictor") { + auto const &dense_predt = *p_dense_predt; + if (ctx->IsCPU()) { ASSERT_EQ(dense_predt.HostVector(), sparse_predt.HostVector()); } else { auto const &h_dense = dense_predt.HostVector(); @@ -556,10 +569,10 @@ void VerifySparsePredictionColumnSplit(DMatrix *dmat, Learner *learner, } } // anonymous namespace -void TestSparsePredictionColumnSplit(float sparsity, std::string predictor) { +void TestSparsePredictionColumnSplit(Context const* ctx, float sparsity) { size_t constexpr kRows = 512, kCols = 128, kIters = 4; auto Xy = RandomDataGenerator(kRows, kCols, sparsity).GenerateDMatrix(true); - auto learner = LearnerForTest(Xy, kIters); + auto learner = LearnerForTest(ctx, Xy, kIters); HostDeviceVector sparse_predt; @@ -569,7 +582,7 @@ void TestSparsePredictionColumnSplit(float sparsity, std::string predictor) { learner.reset(Learner::Create({Xy})); learner->LoadModel(model); - learner->SetParam("predictor", predictor); + ConfigLearnerByCtx(ctx, learner.get()); learner->Predict(Xy, false, &sparse_predt, 0, 0); auto constexpr kWorldSize = 2; diff --git a/tests/cpp/predictor/test_predictor.h b/tests/cpp/predictor/test_predictor.h index 4854029fb..c6f4d1816 100644 --- a/tests/cpp/predictor/test_predictor.h +++ b/tests/cpp/predictor/test_predictor.h @@ -31,8 +31,17 @@ inline gbm::GBTreeModel CreateTestModel(LearnerModelParam const* param, Context return model; } +inline auto CreatePredictorForTest(Context const* ctx) { + if (ctx->IsCPU()) { + return Predictor::Create("cpu_predictor", ctx); + } else { + return Predictor::Create("gpu_predictor", ctx); + } +} + +// fixme: cpu test template -void TestPredictionFromGradientIndex(std::string name, size_t rows, size_t cols, +void TestPredictionFromGradientIndex(Context const* ctx, size_t rows, size_t cols, std::shared_ptr p_hist) { constexpr size_t kClasses { 3 }; @@ -40,12 +49,10 @@ void TestPredictionFromGradientIndex(std::string name, size_t rows, size_t cols, auto cuda_ctx = MakeCUDACtx(0); std::unique_ptr predictor = - std::unique_ptr(Predictor::Create(name, &cuda_ctx)); + std::unique_ptr(CreatePredictorForTest(&cuda_ctx)); predictor->Configure({}); - Context ctx; - ctx.UpdateAllowUnknown(Args{}); - gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx, kClasses); + gbm::GBTreeModel model = CreateTestModel(&mparam, ctx, kClasses); { auto p_precise = RandomDataGenerator(rows, cols, 0).GenerateDMatrix(); @@ -81,28 +88,30 @@ void TestTrainingPrediction(size_t rows, size_t bins, std::string tree_method, std::shared_ptr p_full, std::shared_ptr p_hist); -void TestInplacePrediction(std::shared_ptr x, std::string predictor, bst_row_t rows, - bst_feature_t cols, int32_t device = -1); +void TestInplacePrediction(Context const* ctx, std::shared_ptr x, bst_row_t rows, + bst_feature_t cols); -void TestPredictionWithLesserFeatures(std::string preditor_name); +void TestPredictionWithLesserFeatures(Context const* ctx); -void TestPredictionWithLesserFeaturesColumnSplit(std::string preditor_name); +void TestPredictionDeviceAccess(); -void TestCategoricalPrediction(std::string name, bool is_column_split = false); +void TestCategoricalPrediction(Context const* ctx, bool 
is_column_split); -void TestCategoricalPredictionColumnSplit(std::string name); +void TestCategoricalPredictionColumnSplit(Context const* ctx); -void TestCategoricalPredictLeaf(StringView name, bool is_column_split = false); +void TestPredictionWithLesserFeaturesColumnSplit(Context const* ctx); -void TestCategoricalPredictLeafColumnSplit(StringView name); +void TestCategoricalPredictLeaf(Context const* ctx, bool is_column_split); -void TestIterationRange(std::string name); +void TestCategoricalPredictLeafColumnSplit(Context const* ctx); -void TestIterationRangeColumnSplit(std::string name); +void TestIterationRange(Context const* ctx); -void TestSparsePrediction(float sparsity, std::string predictor); +void TestIterationRangeColumnSplit(Context const* ctx); -void TestSparsePredictionColumnSplit(float sparsity, std::string predictor); +void TestSparsePrediction(Context const* ctx, float sparsity); + +void TestSparsePredictionColumnSplit(Context const* ctx, float sparsity); void TestVectorLeafPrediction(Context const* ctx); } // namespace xgboost diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc index 5c561d2a4..0981fc352 100644 --- a/tests/cpp/test_learner.cc +++ b/tests/cpp/test_learner.cc @@ -342,16 +342,6 @@ TEST(Learner, GPUConfiguration) { learner->UpdateOneIter(0, p_dmat); ASSERT_EQ(learner->Ctx()->gpu_id, 0); } - { - // With CPU algorithm but GPU Predictor, this is to simulate when - // XGBoost is only used for prediction, so tree method is not - // specified. - std::unique_ptr learner {Learner::Create(mat)}; - learner->SetParams({Arg{"tree_method", "hist"}, - Arg{"predictor", "gpu_predictor"}}); - learner->UpdateOneIter(0, p_dmat); - ASSERT_EQ(learner->Ctx()->gpu_id, 0); - } } #endif // defined(XGBOOST_USE_CUDA) diff --git a/tests/cpp/test_serialization.cc b/tests/cpp/test_serialization.cc index a01a21ef6..350744c58 100644 --- a/tests/cpp/test_serialization.cc +++ b/tests/cpp/test_serialization.cc @@ -698,10 +698,6 @@ TEST_F(MultiClassesSerializationTest, GpuHist) { {"seed", "0"}, {"nthread", "1"}, {"max_depth", std::to_string(kClasses)}, - // Somehow rebuilding the cache can generate slightly - // different result (1e-7) with CPU predictor for some - // entries. - {"predictor", "gpu_predictor"}, // Mitigate the difference caused by hardware fused multiply // add to tree weight during update prediction cache. 
{"learning_rate", "1.0"}, diff --git a/tests/python-gpu/load_pickle.py b/tests/python-gpu/load_pickle.py index f12dde360..caefa362d 100644 --- a/tests/python-gpu/load_pickle.py +++ b/tests/python-gpu/load_pickle.py @@ -1,5 +1,5 @@ -'''Loading a pickled model generated by test_pickling.py, only used by -`test_gpu_with_dask.py`''' +"""Loading a pickled model generated by test_pickling.py, only used by +`test_gpu_with_dask.py`""" import json import os @@ -12,9 +12,9 @@ from xgboost import testing as tm class TestLoadPickle: - def test_load_pkl(self): - '''Test whether prediction is correct.''' - assert os.environ['CUDA_VISIBLE_DEVICES'] == '-1' + def test_load_pkl(self) -> None: + """Test whether prediction is correct.""" + assert os.environ["CUDA_VISIBLE_DEVICES"] == "-1" bst = load_pickle(model_path) x, y = build_dataset() if isinstance(bst, xgb.Booster): @@ -28,46 +28,42 @@ class TestLoadPickle: assert len(res) == 10 - def test_predictor_type_is_auto(self): - '''Under invalid CUDA_VISIBLE_DEVICES, predictor should be set to - auto''' - assert os.environ['CUDA_VISIBLE_DEVICES'] == '-1' + def test_context_is_removed(self) -> None: + """Under invalid CUDA_VISIBLE_DEVICES, context should reset""" + assert os.environ["CUDA_VISIBLE_DEVICES"] == "-1" bst = load_pickle(model_path) config = bst.save_config() config = json.loads(config) - assert config['learner']['gradient_booster']['gbtree_train_param'][ - 'predictor'] == 'auto' + assert config["learner"]["generic_param"]["gpu_id"] == "-1" - def test_predictor_type_is_gpu(self): - '''When CUDA_VISIBLE_DEVICES is not specified, keep using - `gpu_predictor`''' - assert 'CUDA_VISIBLE_DEVICES' not in os.environ.keys() + def test_context_is_preserved(self) -> None: + """Test the device context is preserved after pickling.""" + assert "CUDA_VISIBLE_DEVICES" not in os.environ.keys() bst = load_pickle(model_path) config = bst.save_config() config = json.loads(config) - assert config['learner']['gradient_booster']['gbtree_train_param'][ - 'predictor'] == 'gpu_predictor' + assert config["learner"]["generic_param"]["gpu_id"] == "0" - def test_wrap_gpu_id(self): - assert os.environ['CUDA_VISIBLE_DEVICES'] == '0' + def test_wrap_gpu_id(self) -> None: + assert os.environ["CUDA_VISIBLE_DEVICES"] == "0" bst = load_pickle(model_path) config = bst.save_config() config = json.loads(config) - assert config['learner']['generic_param']['gpu_id'] == '0' + assert config["learner"]["generic_param"]["gpu_id"] == "0" x, y = build_dataset() test_x = xgb.DMatrix(x) res = bst.predict(test_x) assert len(res) == 10 - def test_training_on_cpu_only_env(self): - assert os.environ['CUDA_VISIBLE_DEVICES'] == '-1' + def test_training_on_cpu_only_env(self) -> None: + assert os.environ["CUDA_VISIBLE_DEVICES"] == "-1" rng = np.random.RandomState(1994) X = rng.randn(10, 10) y = rng.randn(10) with tm.captured_output() as (out, err): # Test no thrust exception is thrown with pytest.raises(xgb.core.XGBoostError): - xgb.train({'tree_method': 'gpu_hist'}, xgb.DMatrix(X, y)) + xgb.train({"tree_method": "gpu_hist"}, xgb.DMatrix(X, y)) - assert out.getvalue().find('No visible GPU is found') != -1 + assert out.getvalue().find("No visible GPU is found") != -1 diff --git a/tests/python-gpu/test_device_quantile_dmatrix.py b/tests/python-gpu/test_device_quantile_dmatrix.py index c5b7e4fc5..477e9f2a1 100644 --- a/tests/python-gpu/test_device_quantile_dmatrix.py +++ b/tests/python-gpu/test_device_quantile_dmatrix.py @@ -203,7 +203,7 @@ class TestQuantileDMatrix: np.testing.assert_equal(h_ret.indices, 
d_ret.indices) booster = xgb.train( - {"tree_method": "gpu_hist", "predictor": "gpu_predictor"}, dtrain=d_m + {"tree_method": "gpu_hist", "gpu_id": "0"}, dtrain=d_m ) np.testing.assert_allclose( diff --git a/tests/python-gpu/test_from_cupy.py b/tests/python-gpu/test_from_cupy.py index 8ac2bf876..70080b13a 100644 --- a/tests/python-gpu/test_from_cupy.py +++ b/tests/python-gpu/test_from_cupy.py @@ -221,9 +221,10 @@ Arrow specification.''' def test_specified_device(self): import cupy as cp cp.cuda.runtime.setDevice(0) - dtrain = dmatrix_from_cupy( - np.float32, xgb.QuantileDMatrix, np.nan) - with pytest.raises(xgb.core.XGBoostError): + dtrain = dmatrix_from_cupy(np.float32, xgb.QuantileDMatrix, np.nan) + with pytest.raises( + xgb.core.XGBoostError, match="Data is resided on a different device" + ): xgb.train( {'tree_method': 'gpu_hist', 'gpu_id': 1}, dtrain, num_boost_round=10 ) diff --git a/tests/python-gpu/test_gpu_pickling.py b/tests/python-gpu/test_gpu_pickling.py index 7c452926e..49ac24740 100644 --- a/tests/python-gpu/test_gpu_pickling.py +++ b/tests/python-gpu/test_gpu_pickling.py @@ -1,5 +1,4 @@ -'''Test model IO with pickle.''' -import json +"""Test model IO with pickle.""" import os import pickle import subprocess @@ -11,49 +10,48 @@ import xgboost as xgb from xgboost import XGBClassifier from xgboost import testing as tm -model_path = './model.pkl' +model_path = "./model.pkl" pytestmark = tm.timeout(30) def build_dataset(): N = 10 - x = np.linspace(0, N*N, N*N) + x = np.linspace(0, N * N, N * N) x = x.reshape((N, N)) y = np.linspace(0, N, N) return x, y def save_pickle(bst, path): - with open(path, 'wb') as fd: + with open(path, "wb") as fd: pickle.dump(bst, fd) def load_pickle(path): - with open(path, 'rb') as fd: + with open(path, "rb") as fd: bst = pickle.load(fd) return bst class TestPickling: - args_template = [ - "pytest", - "--verbose", - "-s", - "--fulltrace"] + args_template = ["pytest", "--verbose", "-s", "--fulltrace"] def run_pickling(self, bst) -> None: save_pickle(bst, model_path) args = [ - "pytest", "--verbose", "-s", "--fulltrace", - "./tests/python-gpu/load_pickle.py::TestLoadPickle::test_load_pkl" + "pytest", + "--verbose", + "-s", + "--fulltrace", + "./tests/python-gpu/load_pickle.py::TestLoadPickle::test_load_pkl", ] - command = '' + command = "" for arg in args: command += arg - command += ' ' + command += " " - cuda_environment = {'CUDA_VISIBLE_DEVICES': '-1'} + cuda_environment = {"CUDA_VISIBLE_DEVICES": "-1"} env = os.environ.copy() # Passing new_environment directly to `env' argument results # in failure on Windows: @@ -72,7 +70,7 @@ class TestPickling: x, y = build_dataset() train_x = xgb.DMatrix(x, label=y) - param = {'tree_method': 'gpu_hist', "gpu_id": 0} + param = {"tree_method": "gpu_hist", "gpu_id": 0} bst = xgb.train(param, train_x) self.run_pickling(bst) @@ -91,43 +89,46 @@ class TestPickling: X, y = build_dataset() dtrain = xgb.DMatrix(X, y) - bst = xgb.train({'tree_method': 'gpu_hist', - 'gpu_id': 1}, - dtrain, num_boost_round=6) + bst = xgb.train( + {"tree_method": "gpu_hist", "gpu_id": 1}, dtrain, num_boost_round=6 + ) - model_path = 'model.pkl' + model_path = "model.pkl" save_pickle(bst, model_path) - cuda_environment = {'CUDA_VISIBLE_DEVICES': '0'} + cuda_environment = {"CUDA_VISIBLE_DEVICES": "0"} env = os.environ.copy() env.update(cuda_environment) args = self.args_template.copy() args.append( - "./tests/python-gpu/" - "load_pickle.py::TestLoadPickle::test_wrap_gpu_id" + "./tests/python-gpu/" 
"load_pickle.py::TestLoadPickle::test_wrap_gpu_id" ) status = subprocess.call(args, env=env) assert status == 0 os.remove(model_path) - def test_pickled_predictor(self): - x, y = build_dataset() + def test_pickled_context(self): + x, y = tm.make_sparse_regression(10, 10, sparsity=0.8, as_dense=True) train_x = xgb.DMatrix(x, label=y) - param = {'tree_method': 'gpu_hist', - 'verbosity': 1, 'predictor': 'gpu_predictor'} + param = {"tree_method": "gpu_hist", "verbosity": 1} bst = xgb.train(param, train_x) - config = json.loads(bst.save_config()) - assert config['learner']['gradient_booster']['gbtree_train_param'][ - 'predictor'] == 'gpu_predictor' + + with tm.captured_output() as (out, err): + bst.inplace_predict(x) + + # The warning is redirected to Python callback, so it's printed in stdout + # instead of stderr. + stdout = out.getvalue() + assert stdout.find("mismatched devices") != -1 save_pickle(bst, model_path) args = self.args_template.copy() - args.append( - "./tests/python-gpu/" - "load_pickle.py::TestLoadPickle::test_predictor_type_is_auto") + root = tm.project_root(__file__) + path = os.path.join(root, "tests", "python-gpu", "load_pickle.py") + args.append(path + "::TestLoadPickle::test_context_is_removed") - cuda_environment = {'CUDA_VISIBLE_DEVICES': '-1'} + cuda_environment = {"CUDA_VISIBLE_DEVICES": "-1"} env = os.environ.copy() env.update(cuda_environment) @@ -138,25 +139,29 @@ class TestPickling: args = self.args_template.copy() args.append( "./tests/python-gpu/" - "load_pickle.py::TestLoadPickle::test_predictor_type_is_gpu") + "load_pickle.py::TestLoadPickle::test_context_is_preserved" + ) # Load in environment that has GPU. env = os.environ.copy() - assert 'CUDA_VISIBLE_DEVICES' not in env.keys() + assert "CUDA_VISIBLE_DEVICES" not in env.keys() status = subprocess.call(args, env=env) assert status == 0 os.remove(model_path) @pytest.mark.skipif(**tm.no_sklearn()) - def test_predict_sklearn_pickle(self): + def test_predict_sklearn_pickle(self) -> None: from sklearn.datasets import load_digits + x, y = load_digits(return_X_y=True) - kwargs = {'tree_method': 'gpu_hist', - 'predictor': 'gpu_predictor', - 'objective': 'binary:logistic', - 'n_estimators': 10} + kwargs = { + "tree_method": "gpu_hist", + "objective": "binary:logistic", + "gpu_id": 0, + "n_estimators": 10, + } model = XGBClassifier(**kwargs) model.fit(x, y) @@ -165,24 +170,25 @@ class TestPickling: del model # load model - model: xgb.XGBClassifier = load_pickle("model.pkl") + model = load_pickle("model.pkl") os.remove("model.pkl") gpu_pred = model.predict(x, output_margin=True) # Switch to CPU predictor bst = model.get_booster() - bst.set_param({'predictor': 'cpu_predictor'}) + tm.set_ordinal(-1, bst) cpu_pred = model.predict(x, output_margin=True) np.testing.assert_allclose(cpu_pred, gpu_pred, rtol=1e-5) def test_training_on_cpu_only_env(self): - cuda_environment = {'CUDA_VISIBLE_DEVICES': '-1'} + cuda_environment = {"CUDA_VISIBLE_DEVICES": "-1"} env = os.environ.copy() env.update(cuda_environment) args = self.args_template.copy() args.append( "./tests/python-gpu/" - "load_pickle.py::TestLoadPickle::test_training_on_cpu_only_env") + "load_pickle.py::TestLoadPickle::test_training_on_cpu_only_env" + ) status = subprocess.call(args, env=env) assert status == 0 diff --git a/tests/python-gpu/test_gpu_prediction.py b/tests/python-gpu/test_gpu_prediction.py index dba2e9aeb..0d961d0e3 100644 --- a/tests/python-gpu/test_gpu_prediction.py +++ b/tests/python-gpu/test_gpu_prediction.py @@ -1,4 +1,5 @@ import sys +from copy 
import copy import numpy as np import pytest @@ -11,8 +12,10 @@ from xgboost.compat import PANDAS_INSTALLED if PANDAS_INSTALLED: from hypothesis.extra.pandas import column, data_frames, range_indexes else: + def noop(*args, **kwargs): pass + column, data_frames, range_indexes = noop, noop, noop sys.path.append("tests/python") @@ -21,16 +24,20 @@ from test_predict import run_threaded_predict # noqa rng = np.random.RandomState(1994) -shap_parameter_strategy = strategies.fixed_dictionaries({ - 'max_depth': strategies.integers(1, 11), - 'max_leaves': strategies.integers(0, 256), - 'num_parallel_tree': strategies.sampled_from([1, 10]), -}).filter(lambda x: x['max_depth'] > 0 or x['max_leaves'] > 0) +shap_parameter_strategy = strategies.fixed_dictionaries( + { + "max_depth": strategies.integers(1, 11), + "max_leaves": strategies.integers(0, 256), + "num_parallel_tree": strategies.sampled_from([1, 10]), + } +).filter(lambda x: x["max_depth"] > 0 or x["max_leaves"] > 0) -predict_parameter_strategy = strategies.fixed_dictionaries({ - 'max_depth': strategies.integers(1, 8), - 'num_parallel_tree': strategies.sampled_from([1, 4]), -}) +predict_parameter_strategy = strategies.fixed_dictionaries( + { + "max_depth": strategies.integers(1, 8), + "num_parallel_tree": strategies.sampled_from([1, 4]), + } +) pytestmark = tm.timeout(20) @@ -47,43 +54,45 @@ class TestGPUPredict: # with 5000 rows is 0.04. for num_rows in test_num_rows: for num_cols in test_num_cols: - dtrain = xgb.DMatrix(np.random.randn(num_rows, num_cols), - label=[0, 1] * int(num_rows / 2)) - dval = xgb.DMatrix(np.random.randn(num_rows, num_cols), - label=[0, 1] * int(num_rows / 2)) - dtest = xgb.DMatrix(np.random.randn(num_rows, num_cols), - label=[0, 1] * int(num_rows / 2)) - watchlist = [(dtrain, 'train'), (dval, 'validation')] + dtrain = xgb.DMatrix( + np.random.randn(num_rows, num_cols), + label=[0, 1] * int(num_rows / 2), + ) + dval = xgb.DMatrix( + np.random.randn(num_rows, num_cols), + label=[0, 1] * int(num_rows / 2), + ) + dtest = xgb.DMatrix( + np.random.randn(num_rows, num_cols), + label=[0, 1] * int(num_rows / 2), + ) + watchlist = [(dtrain, "train"), (dval, "validation")] res = {} param = { "objective": "binary:logistic", - "predictor": "gpu_predictor", - 'eval_metric': 'logloss', - 'tree_method': 'gpu_hist', - 'max_depth': 1 + "eval_metric": "logloss", + "tree_method": "gpu_hist", + "gpu_id": 0, + "max_depth": 1, } - bst = xgb.train(param, dtrain, iterations, evals=watchlist, - evals_result=res) - assert self.non_increasing(res["train"]["logloss"]) + bst = xgb.train( + param, dtrain, iterations, evals=watchlist, evals_result=res + ) + assert tm.non_increasing(res["train"]["logloss"], tolerance=0.001) + gpu_pred_train = bst.predict(dtrain, output_margin=True) gpu_pred_test = bst.predict(dtest, output_margin=True) gpu_pred_val = bst.predict(dval, output_margin=True) - param["predictor"] = "cpu_predictor" - bst_cpu = xgb.train(param, dtrain, iterations, evals=watchlist) + bst.set_param({"gpu_id": -1, "tree_method": "hist"}) + bst_cpu = copy(bst) cpu_pred_train = bst_cpu.predict(dtrain, output_margin=True) cpu_pred_test = bst_cpu.predict(dtest, output_margin=True) cpu_pred_val = bst_cpu.predict(dval, output_margin=True) - np.testing.assert_allclose(cpu_pred_train, gpu_pred_train, - rtol=1e-6) - np.testing.assert_allclose(cpu_pred_val, gpu_pred_val, - rtol=1e-6) - np.testing.assert_allclose(cpu_pred_test, gpu_pred_test, - rtol=1e-6) - - def non_increasing(self, L): - return all((y - x) < 0.001 for x, y in zip(L, L[1:])) + 
np.testing.assert_allclose(cpu_pred_train, gpu_pred_train, rtol=1e-6) + np.testing.assert_allclose(cpu_pred_val, gpu_pred_val, rtol=1e-6) + np.testing.assert_allclose(cpu_pred_test, gpu_pred_test, rtol=1e-6) # Test case for a bug where multiple batch predictions made on a # test set produce incorrect results @@ -94,26 +103,22 @@ class TestGPUPredict: n = 1000 X, y = make_regression(n, random_state=rng) - X_train, X_test, y_train, y_test = train_test_split(X, y, - random_state=123) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123) dtrain = xgb.DMatrix(X_train, label=y_train) - dtest = xgb.DMatrix(X_test) params = {} params["tree_method"] = "gpu_hist" + bst = xgb.train(params, dtrain) - params['predictor'] = "gpu_predictor" - bst_gpu_predict = xgb.train(params, dtrain) + tm.set_ordinal(0, bst) + # Don't reuse the DMatrix for prediction, otherwise the result is cached. + predict_gpu_0 = bst.predict(xgb.DMatrix(X_test)) + predict_gpu_1 = bst.predict(xgb.DMatrix(X_test)) + tm.set_ordinal(-1, bst) + predict_cpu = bst.predict(xgb.DMatrix(X_test)) - params['predictor'] = "cpu_predictor" - bst_cpu_predict = xgb.train(params, dtrain) - - predict0 = bst_gpu_predict.predict(dtest) - predict1 = bst_gpu_predict.predict(dtest) - cpu_predict = bst_cpu_predict.predict(dtest) - - assert np.allclose(predict0, predict1) - assert np.allclose(predict0, cpu_predict) + assert np.allclose(predict_gpu_0, predict_gpu_1) + assert np.allclose(predict_gpu_0, predict_cpu) @pytest.mark.skipif(**tm.no_sklearn()) def test_sklearn(self): @@ -121,30 +126,31 @@ class TestGPUPredict: tr_size = 2500 X = np.random.rand(m, n) y = 200 * np.matmul(X, np.arange(-3, -3 + n)) + y = y.reshape(y.size) X_train, y_train = X[:tr_size, :], y[:tr_size] X_test, y_test = X[tr_size:, :], y[tr_size:] - # First with cpu_predictor - params = {'tree_method': 'gpu_hist', - 'predictor': 'cpu_predictor', - 'n_jobs': -1, - 'seed': 123} - m = xgb.XGBRegressor(**params).fit(X_train, y_train) - cpu_train_score = m.score(X_train, y_train) - cpu_test_score = m.score(X_test, y_test) - - # Now with gpu_predictor - params['predictor'] = 'gpu_predictor' - + params = { + "tree_method": "gpu_hist", + "gpu_id": "0", + "n_jobs": -1, + "seed": 123, + } m = xgb.XGBRegressor(**params).fit(X_train, y_train) gpu_train_score = m.score(X_train, y_train) gpu_test_score = m.score(X_test, y_test) + # Now with cpu + m = tm.set_ordinal(-1, m) + cpu_train_score = m.score(X_train, y_train) + cpu_test_score = m.score(X_test, y_test) + assert np.allclose(cpu_train_score, gpu_train_score) assert np.allclose(cpu_test_score, gpu_test_score) def run_inplace_base_margin(self, booster, dtrain, X, base_margin): import cupy as cp + dtrain.set_info(base_margin=base_margin) from_inplace = booster.inplace_predict(data=X, base_margin=base_margin) from_dmatrix = booster.predict(dtrain) @@ -152,10 +158,11 @@ class TestGPUPredict: def run_inplace_predict_cupy(self, device: int) -> None: import cupy as cp + cp.cuda.runtime.setDevice(device) rows = 1000 cols = 10 - missing = 11 # set to integer for testing + missing = 11 # set to integer for testing cp_rng = cp.random.RandomState(1994) cp.random.set_random_state(cp_rng) @@ -168,7 +175,7 @@ class TestGPUPredict: dtrain = xgb.DMatrix(X, y) booster = xgb.train( - {'tree_method': 'gpu_hist', "gpu_id": device}, dtrain, num_boost_round=10 + {"tree_method": "gpu_hist", "gpu_id": device}, dtrain, num_boost_round=10 ) test = xgb.DMatrix(X[:10, ...], missing=missing) @@ -186,7 +193,7 @@ class TestGPUPredict: # Don't do this on 
Windows, see issue #5793 if sys.platform.startswith("win"): pytest.skip( - 'Multi-threaded in-place prediction with cuPy is not working on Windows' + "Multi-threaded in-place prediction with cuPy is not working on Windows" ) for i in range(10): run_threaded_predict(X, rows, predict_dense) @@ -205,9 +212,10 @@ class TestGPUPredict: ) reg.fit(X, y) + reg = tm.set_ordinal(device, reg) gpu_predt = reg.predict(X) - reg.set_params(predictor="cpu_predictor") - cpu_predt = reg.predict(X) + reg = tm.set_ordinal(-1, reg) + cpu_predt = reg.predict(cp.asnumpy(X)) np.testing.assert_allclose(gpu_predt, cpu_predt, atol=1e-6) cp.cuda.runtime.setDevice(0) @@ -215,11 +223,11 @@ class TestGPUPredict: def test_inplace_predict_cupy(self): self.run_inplace_predict_cupy(0) - @pytest.mark.xfail @pytest.mark.skipif(**tm.no_cupy()) @pytest.mark.mgpu def test_inplace_predict_cupy_specified_device(self): import cupy as cp + n_devices = cp.cuda.runtime.getDeviceCount() for d in range(n_devices): self.run_inplace_predict_cupy(d) @@ -230,6 +238,7 @@ class TestGPUPredict: import cudf import cupy as cp import pandas as pd + rows = 1000 cols = 10 rng = np.random.RandomState(1994) @@ -241,8 +250,7 @@ class TestGPUPredict: dtrain = xgb.DMatrix(X, y) - booster = xgb.train({'tree_method': 'gpu_hist'}, - dtrain, num_boost_round=10) + booster = xgb.train({"tree_method": "gpu_hist"}, dtrain, num_boost_round=10) test = xgb.DMatrix(X) predt_from_array = booster.inplace_predict(X) predt_from_dmatrix = booster.predict(test) @@ -272,11 +280,12 @@ class TestGPUPredict: def test_shap(self, num_rounds, dataset, param): if dataset.name.endswith("-l1"): # not supported by the exact tree method return - param.update({"predictor": "gpu_predictor", "gpu_id": 0}) + param.update({"tree_method": "gpu_hist", "gpu_id": 0}) param = dataset.set_params(param) dmat = dataset.get_dmat() bst = xgb.train(param, dmat, num_rounds) test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w, dataset.margin) + bst = tm.set_ordinal(0, bst) shap = bst.predict(test_dmat, pred_contribs=True) margin = bst.predict(test_dmat, output_margin=True) assume(len(dataset.y) > 0) @@ -289,31 +298,35 @@ class TestGPUPredict: def test_shap_interactions(self, num_rounds, dataset, param): if dataset.name.endswith("-l1"): # not supported by the exact tree method return - param.update({"predictor": "gpu_predictor", "gpu_id": 0}) + param.update({"tree_method": "hist", "gpu_id": 0}) param = dataset.set_params(param) dmat = dataset.get_dmat() bst = xgb.train(param, dmat, num_rounds) test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w, dataset.margin) + bst = tm.set_ordinal(0, bst) shap = bst.predict(test_dmat, pred_interactions=True) margin = bst.predict(test_dmat, output_margin=True) assume(len(dataset.y) > 0) - assert np.allclose(np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2)), - margin, - 1e-3, 1e-3) + assert np.allclose( + np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2)), + margin, + 1e-3, + 1e-3, + ) def test_shap_categorical(self): X, y = tm.make_categorical(100, 20, 7, False) Xy = xgb.DMatrix(X, y, enable_categorical=True) booster = xgb.train({"tree_method": "gpu_hist"}, Xy, num_boost_round=10) - booster.set_param({"predictor": "gpu_predictor"}) + booster = tm.set_ordinal(0, booster) shap = booster.predict(Xy, pred_contribs=True) margin = booster.predict(Xy, output_margin=True) np.testing.assert_allclose( np.sum(shap, axis=len(shap.shape) - 1), margin, rtol=1e-3 ) - booster.set_param({"predictor": "cpu_predictor"}) + booster = tm.set_ordinal(-1, 
booster) shap = booster.predict(Xy, pred_contribs=True) margin = booster.predict(Xy, output_margin=True) np.testing.assert_allclose( @@ -321,18 +334,20 @@ class TestGPUPredict: ) def test_predict_leaf_basic(self): - gpu_leaf = run_predict_leaf('gpu_predictor') - cpu_leaf = run_predict_leaf('cpu_predictor') + gpu_leaf = run_predict_leaf(0) + cpu_leaf = run_predict_leaf(-1) np.testing.assert_equal(gpu_leaf, cpu_leaf) def run_predict_leaf_booster(self, param, num_rounds, dataset): param = dataset.set_params(param) m = dataset.get_dmat() - booster = xgb.train(param, dtrain=dataset.get_dmat(), num_boost_round=num_rounds) - booster.set_param({'predictor': 'cpu_predictor'}) + booster = xgb.train( + param, dtrain=dataset.get_dmat(), num_boost_round=num_rounds + ) + booster = tm.set_ordinal(-1, booster) cpu_leaf = booster.predict(m, pred_leaf=True) - booster.set_param({'predictor': 'gpu_predictor'}) + booster = tm.set_ordinal(0, booster) gpu_leaf = booster.predict(m, pred_leaf=True) np.testing.assert_equal(cpu_leaf, gpu_leaf) @@ -344,8 +359,8 @@ class TestGPUPredict: if param.get("num_parallel_tree", 1) > 1 and dataset.name.endswith("-l1"): return - param['booster'] = 'gbtree' - param['tree_method'] = 'gpu_hist' + param["booster"] = "gbtree" + param["tree_method"] = "gpu_hist" self.run_predict_leaf_booster(param, 10, dataset) @given(predict_parameter_strategy, tm.make_dataset_strategy()) @@ -355,42 +370,61 @@ class TestGPUPredict: if param.get("num_parallel_tree", 1) > 1 and dataset.name.endswith("-l1"): return - param['booster'] = 'dart' - param['tree_method'] = 'gpu_hist' + param["booster"] = "dart" + param["tree_method"] = "gpu_hist" self.run_predict_leaf_booster(param, 10, dataset) @pytest.mark.skipif(**tm.no_sklearn()) @pytest.mark.skipif(**tm.no_pandas()) - @given(df=data_frames([column('x0', elements=strategies.integers(min_value=0, max_value=3)), - column('x1', elements=strategies.integers(min_value=0, max_value=5))], - index=range_indexes(min_size=20, max_size=50))) + @given( + df=data_frames( + [ + column("x0", elements=strategies.integers(min_value=0, max_value=3)), + column("x1", elements=strategies.integers(min_value=0, max_value=5)), + ], + index=range_indexes(min_size=20, max_size=50), + ) + ) @settings(deadline=None, max_examples=20, print_blob=True) def test_predict_categorical_split(self, df): from sklearn.metrics import mean_squared_error - df = df.astype('category') - x0, x1 = df['x0'].to_numpy(), df['x1'].to_numpy() + df = df.astype("category") + x0, x1 = df["x0"].to_numpy(), df["x1"].to_numpy() y = (x0 * 10 - 20) + (x1 - 2) dtrain = xgb.DMatrix(df, label=y, enable_categorical=True) params = { - 'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor', - 'max_depth': 3, 'learning_rate': 1.0, 'base_score': 0.0, 'eval_metric': 'rmse' + "tree_method": "gpu_hist", + "max_depth": 3, + "learning_rate": 1.0, + "base_score": 0.0, + "eval_metric": "rmse", + "gpu_id": "0", } eval_history = {} - bst = xgb.train(params, dtrain, num_boost_round=5, evals=[(dtrain, 'train')], - verbose_eval=False, evals_result=eval_history) - + bst = xgb.train( + params, + dtrain, + num_boost_round=5, + evals=[(dtrain, "train")], + verbose_eval=False, + evals_result=eval_history, + ) + bst = tm.set_ordinal(0, bst) pred = bst.predict(dtrain) rmse = mean_squared_error(y_true=y, y_pred=pred, squared=False) - np.testing.assert_almost_equal(rmse, eval_history['train']['rmse'][-1], decimal=5) + np.testing.assert_almost_equal( + rmse, eval_history["train"]["rmse"][-1], decimal=5 + ) 
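To complement the categorical-split and SHAP coverage above, an end-to-end sketch of the same flow through the public API (pandas categoricals plus SHAP additivity); illustrative only, assuming a CUDA build:

    import numpy as np
    import pandas as pd
    import xgboost as xgb

    rng = np.random.RandomState(0)
    df = pd.DataFrame(
        {
            "x0": rng.randint(0, 4, size=100),
            "x1": rng.randint(0, 6, size=100),
        }
    ).astype("category")
    y = rng.randn(100)

    Xy = xgb.DMatrix(df, y, enable_categorical=True)
    booster = xgb.train({"tree_method": "gpu_hist", "gpu_id": 0}, Xy, num_boost_round=10)

    # Per row, SHAP contributions (including the trailing bias column) sum to the margin.
    shap = booster.predict(Xy, pred_contribs=True)
    margin = booster.predict(Xy, output_margin=True)
    np.testing.assert_allclose(shap.sum(axis=-1), margin, rtol=1e-3)
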
@pytest.mark.skipif(**tm.no_cupy()) @pytest.mark.parametrize("n_classes", [2, 3]) def test_predict_dart(self, n_classes): import cupy as cp from sklearn.datasets import make_classification + n_samples = 1000 X_, y_ = make_classification( n_samples=n_samples, n_informative=5, n_classes=n_classes @@ -403,7 +437,7 @@ class TestGPUPredict: "tree_method": "gpu_hist", "booster": "dart", "rate_drop": 0.5, - "objective": "binary:logistic" + "objective": "binary:logistic", } else: params = { @@ -411,15 +445,18 @@ class TestGPUPredict: "booster": "dart", "rate_drop": 0.5, "objective": "multi:softprob", - "num_class": n_classes + "num_class": n_classes, } booster = xgb.train(params, Xy, num_boost_round=32) - # predictor=auto + + # auto (GPU) inplace = booster.inplace_predict(X) copied = booster.predict(Xy) + + # CPU + booster = tm.set_ordinal(-1, booster) cpu_inplace = booster.inplace_predict(X_) - booster.set_param({"predictor": "cpu_predictor"}) cpu_copied = booster.predict(Xy) copied = cp.array(copied) @@ -427,7 +464,8 @@ class TestGPUPredict: cp.testing.assert_allclose(cpu_copied, copied, atol=1e-6) cp.testing.assert_allclose(inplace, copied, atol=1e-6) - booster.set_param({"predictor": "gpu_predictor"}) + # GPU + booster = tm.set_ordinal(0, booster) inplace = booster.inplace_predict(X) copied = booster.predict(Xy) @@ -437,12 +475,11 @@ class TestGPUPredict: @pytest.mark.skipif(**tm.no_cupy()) def test_dtypes(self): import cupy as cp + rows = 1000 cols = 10 rng = cp.random.RandomState(1994) - orig = rng.randint(low=0, high=127, size=rows * cols).reshape( - rows, cols - ) + orig = rng.randint(low=0, high=127, size=rows * cols).reshape(rows, cols) y = rng.randint(low=0, high=127, size=rows) dtrain = xgb.DMatrix(orig, label=y) booster = xgb.train({"tree_method": "gpu_hist"}, dtrain) @@ -450,19 +487,16 @@ class TestGPUPredict: predt_orig = booster.inplace_predict(orig) # all primitive types in numpy for dtype in [ - cp.signedinteger, cp.byte, cp.short, cp.intc, cp.int_, cp.longlong, - cp.unsignedinteger, cp.ubyte, cp.ushort, cp.uintc, cp.uint, cp.ulonglong, - cp.floating, cp.half, cp.single, cp.double, @@ -472,9 +506,7 @@ class TestGPUPredict: cp.testing.assert_allclose(predt, predt_orig) # boolean - orig = cp.random.binomial(1, 0.5, size=rows * cols).reshape( - rows, cols - ) + orig = cp.random.binomial(1, 0.5, size=rows * cols).reshape(rows, cols) predt_orig = booster.inplace_predict(orig) for dtype in [cp.bool8, cp.bool_]: X = cp.array(orig, dtype=dtype) diff --git a/tests/python-gpu/test_gpu_ranking.py b/tests/python-gpu/test_gpu_ranking.py index 50bbc3f1c..2579b17de 100644 --- a/tests/python-gpu/test_gpu_ranking.py +++ b/tests/python-gpu/test_gpu_ranking.py @@ -29,7 +29,6 @@ def comp_training_with_rank_objective( "booster": "gbtree", "tree_method": "gpu_hist", "gpu_id": 0, - "predictor": "gpu_predictor", } num_trees = 100 @@ -54,7 +53,6 @@ def comp_training_with_rank_objective( "booster": "gbtree", "tree_method": "hist", "gpu_id": -1, - "predictor": "cpu_predictor", } cpu_params["objective"] = rank_objective cpu_params["eval_metric"] = metric_name diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py index 8522f41d3..a6b183daf 100644 --- a/tests/python-gpu/test_gpu_updaters.py +++ b/tests/python-gpu/test_gpu_updaters.py @@ -260,7 +260,6 @@ class TestGPUUpdaters: "seed": 66, "subsample": 0.5, "gamma": 0.2, - "predictor": "auto", "eval_metric": "auc", }, num_boost_round=150, diff --git a/tests/python/test_predict.py b/tests/python/test_predict.py index 
6f89edd16..15288f53e 100644 --- a/tests/python/test_predict.py +++ b/tests/python/test_predict.py @@ -28,7 +28,7 @@ def run_threaded_predict(X, rows, predict_func): assert f.result() -def run_predict_leaf(predictor): +def run_predict_leaf(gpu_id: int) -> np.ndarray: rows = 100 cols = 4 classes = 5 @@ -42,13 +42,13 @@ def run_predict_leaf(predictor): { "num_parallel_tree": num_parallel_tree, "num_class": classes, - "predictor": predictor, "tree_method": "hist", }, m, num_boost_round=num_boost_round, ) + booster = tm.set_ordinal(gpu_id, booster) empty = xgb.DMatrix(np.ones(shape=(0, cols))) empty_leaf = booster.predict(empty, pred_leaf=True) assert empty_leaf.shape[0] == 0 @@ -74,13 +74,14 @@ def run_predict_leaf(predictor): # When there's only 1 tree, the output is a 1 dim vector booster = xgb.train({"tree_method": "hist"}, num_boost_round=1, dtrain=m) + booster = tm.set_ordinal(gpu_id, booster) assert booster.predict(m, pred_leaf=True).shape == (rows,) return leaf -def test_predict_leaf(): - run_predict_leaf("cpu_predictor") +def test_predict_leaf() -> None: + run_predict_leaf(-1) def test_predict_shape(): diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index 0a9013eaa..095c9936a 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -274,7 +274,7 @@ class TestTreeMethod: ) -> None: parameters: Dict[str, Any] = {"tree_method": tree_method} cat, label = tm.make_categorical( - n_samples=rows, n_features=cols, n_categories=cats, onehot=False, sparsity=0.5 + rows, n_features=cols, n_categories=cats, onehot=False, sparsity=0.5 ) Xy = xgb.DMatrix(cat, label, enable_categorical=True) @@ -294,7 +294,9 @@ class TestTreeMethod: y_predt = booster.predict(Xy) rmse = tm.root_mean_square(label, y_predt) - np.testing.assert_allclose(rmse, evals_result["Train"]["rmse"][-1]) + np.testing.assert_allclose( + rmse, evals_result["Train"]["rmse"][-1], rtol=2e-5 + ) # Test with OHE split run(self.USE_ONEHOT) @@ -311,10 +313,8 @@ class TestTreeMethod: by_etl_results: Dict[str, Dict[str, List[float]]] = {} by_builtin_results: Dict[str, Dict[str, List[float]]] = {} - predictor = "gpu_predictor" if tree_method == "gpu_hist" else None parameters: Dict[str, Any] = { "tree_method": tree_method, - "predictor": predictor, # Use one-hot exclusively "max_cat_to_onehot": self.USE_ONEHOT } diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index 26d18493c..f897d8afc 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -1418,23 +1418,6 @@ def test_categorical(): np.testing.assert_allclose(predt_cat, predt_enc) -def test_prediction_config(): - reg = xgb.XGBRegressor() - assert reg._can_use_inplace_predict() is True - - reg.set_params(predictor="cpu_predictor") - assert reg._can_use_inplace_predict() is False - - reg.set_params(predictor="auto") - assert reg._can_use_inplace_predict() is True - - reg.set_params(predictor=None) - assert reg._can_use_inplace_predict() is True - - reg.set_params(booster="gblinear") - assert reg._can_use_inplace_predict() is False - - def test_evaluation_metric(): from sklearn.datasets import load_diabetes, load_digits from sklearn.metrics import mean_absolute_error From e964654b8fcbfe36cacfe66f044d5d52ea135eb5 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 3 Jul 2023 22:06:17 +0800 Subject: [PATCH 015/136] [skl] Enable cat feature without specifying tree method. 
(#9353) --- python-package/xgboost/sklearn.py | 3 +-- tests/python/test_with_sklearn.py | 3 +-- tests/test_distributed/test_with_dask/test_with_dask.py | 6 +++--- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index 440cd34be..a46ba14d0 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -930,8 +930,7 @@ class XGBModel(XGBModelBase): callbacks = self.callbacks if self.callbacks is not None else callbacks tree_method = params.get("tree_method", None) - cat_support = {"gpu_hist", "approx", "hist"} - if self.enable_categorical and tree_method not in cat_support: + if self.enable_categorical and tree_method == "exact": raise ValueError( "Experimental support for categorical data is not implemented for" " current tree method yet." diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index f897d8afc..b4550dab2 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -1390,7 +1390,6 @@ def test_categorical(): X, y = tm.make_categorical(n_samples=32, n_features=2, n_categories=3, onehot=False) ft = ["c"] * X.shape[1] reg = xgb.XGBRegressor( - tree_method="hist", feature_types=ft, max_cat_to_onehot=1, enable_categorical=True, @@ -1409,7 +1408,7 @@ def test_categorical(): onehot, y = tm.make_categorical( n_samples=32, n_features=2, n_categories=3, onehot=True ) - reg = xgb.XGBRegressor(tree_method="hist") + reg = xgb.XGBRegressor() reg.fit(onehot, y, eval_set=[(onehot, y)]) from_enc = reg.evals_result()["validation_0"]["rmse"] predt_enc = reg.predict(onehot) diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py index d6075481f..cab4188a8 100644 --- a/tests/test_distributed/test_with_dask/test_with_dask.py +++ b/tests/test_distributed/test_with_dask/test_with_dask.py @@ -308,7 +308,7 @@ def test_dask_sparse(client: "Client") -> None: def run_categorical(client: "Client", tree_method: str, X, X_onehot, y) -> None: - parameters = {"tree_method": tree_method, "max_cat_to_onehot": 9999} # force onehot + parameters = {"tree_method": tree_method, "max_cat_to_onehot": 9999} # force onehot rounds = 10 m = xgb.dask.DaskDMatrix(client, X_onehot, y, enable_categorical=True) by_etl_results = xgb.dask.train( @@ -364,9 +364,9 @@ def run_categorical(client: "Client", tree_method: str, X, X_onehot, y) -> None: check_model_output(reg.get_booster()) reg = xgb.dask.DaskXGBRegressor( - enable_categorical=True, n_estimators=10 + enable_categorical=True, n_estimators=10, tree_method="exact" ) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="categorical data"): reg.fit(X, y) # check partition based reg = xgb.dask.DaskXGBRegressor( From 6155394a06fc0ce0c3e320f68deb29a7c3de06f1 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 4 Jul 2023 01:04:34 +0800 Subject: [PATCH 016/136] Update news for 1.7.6 [skip ci] (#9350) --- NEWS.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/NEWS.md b/NEWS.md index 963dd3337..2a1000e55 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,6 +3,23 @@ XGBoost Change Log This file records the changes in xgboost library in reverse chronological order. +## 1.7.6 (2023 Jun 16) + +This is a patch release for bug fixes. The CRAN package for the R binding is kept at 1.7.5. + +### Bug Fixes +* Fix distributed training with mixed dense and sparse partitions. 
(#9272) +* Fix monotone constraints on CPU with large trees. (#9122) +* [spark] Make the spark model have the same UID as its estimator (#9022) +* Optimize prediction with `QuantileDMatrix`. (#9096) + +### Document +* Improve doxygen (#8959) +* Update the cuDF pip index URL. (#9106) + +### Maintenance +* Fix tests with pandas 2.0. (#9014) + ## 1.7.5 (2023 Mar 30) This is a patch release for bug fixes. From d0916849a6c554641a6933ffa8ef40c540a55bea Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 4 Jul 2023 01:07:09 +0800 Subject: [PATCH 017/136] Remove unused weight from buffer for cat features. (#9341) --- src/common/hist_util.cu | 107 +++++++++++++++++------------ src/common/hist_util.cuh | 16 ++--- tests/cpp/common/test_hist_util.cu | 74 ++++++++++++++++++-- 3 files changed, 142 insertions(+), 55 deletions(-) diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu index ae86129bf..76fff8a98 100644 --- a/src/common/hist_util.cu +++ b/src/common/hist_util.cu @@ -127,55 +127,76 @@ void SortByWeight(dh::device_vector* weights, }); } -void RemoveDuplicatedCategories( - int32_t device, MetaInfo const &info, Span d_cuts_ptr, - dh::device_vector *p_sorted_entries, - dh::caching_device_vector *p_column_sizes_scan) { +void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span d_cuts_ptr, + dh::device_vector* p_sorted_entries, + dh::device_vector* p_sorted_weights, + dh::caching_device_vector* p_column_sizes_scan) { info.feature_types.SetDevice(device); auto d_feature_types = info.feature_types.ConstDeviceSpan(); CHECK(!d_feature_types.empty()); - auto &column_sizes_scan = *p_column_sizes_scan; - auto &sorted_entries = *p_sorted_entries; + auto& column_sizes_scan = *p_column_sizes_scan; + auto& sorted_entries = *p_sorted_entries; // Removing duplicated entries in categorical features. + + // We don't need to accumulate weight for duplicated entries as there's no weighted + // sketching for categorical features, the categories are the cut values. 
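+  // When a weight buffer is supplied, the (entry, weight) pairs are
+  // deduplicated together through a zip iterator so the two buffers stay
+  // aligned; without weights only the entries themselves are deduplicated.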
dh::caching_device_vector new_column_scan(column_sizes_scan.size()); - dh::SegmentedUnique(column_sizes_scan.data().get(), - column_sizes_scan.data().get() + column_sizes_scan.size(), - sorted_entries.begin(), sorted_entries.end(), - new_column_scan.data().get(), sorted_entries.begin(), - [=] __device__(Entry const &l, Entry const &r) { - if (l.index == r.index) { - if (IsCat(d_feature_types, l.index)) { - return l.fvalue == r.fvalue; - } - } - return false; - }); + std::size_t n_uniques{0}; + if (p_sorted_weights) { + using Pair = thrust::tuple; + auto d_sorted_entries = dh::ToSpan(sorted_entries); + auto d_sorted_weights = dh::ToSpan(*p_sorted_weights); + auto val_in_it = thrust::make_zip_iterator(d_sorted_entries.data(), d_sorted_weights.data()); + auto val_out_it = thrust::make_zip_iterator(d_sorted_entries.data(), d_sorted_weights.data()); + n_uniques = dh::SegmentedUnique( + column_sizes_scan.data().get(), column_sizes_scan.data().get() + column_sizes_scan.size(), + val_in_it, val_in_it + sorted_entries.size(), new_column_scan.data().get(), val_out_it, + [=] __device__(Pair const& l, Pair const& r) { + Entry const& le = thrust::get<0>(l); + Entry const& re = thrust::get<0>(r); + if (le.index == re.index && IsCat(d_feature_types, le.index)) { + return le.fvalue == re.fvalue; + } + return false; + }); + p_sorted_weights->resize(n_uniques); + } else { + n_uniques = dh::SegmentedUnique( + column_sizes_scan.data().get(), column_sizes_scan.data().get() + column_sizes_scan.size(), + sorted_entries.begin(), sorted_entries.end(), new_column_scan.data().get(), + sorted_entries.begin(), [=] __device__(Entry const& l, Entry const& r) { + if (l.index == r.index) { + if (IsCat(d_feature_types, l.index)) { + return l.fvalue == r.fvalue; + } + } + return false; + }); + } + sorted_entries.resize(n_uniques); // Renew the column scan and cut scan based on categorical data. auto d_old_column_sizes_scan = dh::ToSpan(column_sizes_scan); - dh::caching_device_vector new_cuts_size( - info.num_col_ + 1); + dh::caching_device_vector new_cuts_size(info.num_col_ + 1); CHECK_EQ(new_column_scan.size(), new_cuts_size.size()); - dh::LaunchN( - new_column_scan.size(), - [=, d_new_cuts_size = dh::ToSpan(new_cuts_size), - d_old_column_sizes_scan = dh::ToSpan(column_sizes_scan), - d_new_columns_ptr = dh::ToSpan(new_column_scan)] __device__(size_t idx) { - d_old_column_sizes_scan[idx] = d_new_columns_ptr[idx]; - if (idx == d_new_columns_ptr.size() - 1) { - return; - } - if (IsCat(d_feature_types, idx)) { - // Cut size is the same as number of categories in input. - d_new_cuts_size[idx] = - d_new_columns_ptr[idx + 1] - d_new_columns_ptr[idx]; - } else { - d_new_cuts_size[idx] = d_cuts_ptr[idx + 1] - d_cuts_ptr[idx]; - } - }); + dh::LaunchN(new_column_scan.size(), + [=, d_new_cuts_size = dh::ToSpan(new_cuts_size), + d_old_column_sizes_scan = dh::ToSpan(column_sizes_scan), + d_new_columns_ptr = dh::ToSpan(new_column_scan)] __device__(size_t idx) { + d_old_column_sizes_scan[idx] = d_new_columns_ptr[idx]; + if (idx == d_new_columns_ptr.size() - 1) { + return; + } + if (IsCat(d_feature_types, idx)) { + // Cut size is the same as number of categories in input. + d_new_cuts_size[idx] = d_new_columns_ptr[idx + 1] - d_new_columns_ptr[idx]; + } else { + d_new_cuts_size[idx] = d_cuts_ptr[idx + 1] - d_cuts_ptr[idx]; + } + }); // Turn size into ptr. 
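+  // An exclusive scan maps per-feature cut counts such as {4, 3, 5} to
+  // offsets {0, 4, 7, 12}.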
- thrust::exclusive_scan(thrust::device, new_cuts_size.cbegin(), - new_cuts_size.cend(), d_cuts_ptr.data()); + thrust::exclusive_scan(thrust::device, new_cuts_size.cbegin(), new_cuts_size.cend(), + d_cuts_ptr.data()); } } // namespace detail @@ -209,8 +230,8 @@ void ProcessBatch(int device, MetaInfo const &info, const SparsePage &page, auto d_cuts_ptr = cuts_ptr.DeviceSpan(); if (sketch_container->HasCategorical()) { - detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, - &sorted_entries, &column_sizes_scan); + detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, nullptr, + &column_sizes_scan); } auto const& h_cuts_ptr = cuts_ptr.ConstHostVector(); @@ -276,8 +297,8 @@ void ProcessWeightedBatch(int device, const SparsePage& page, &column_sizes_scan); auto d_cuts_ptr = cuts_ptr.DeviceSpan(); if (sketch_container->HasCategorical()) { - detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, - &sorted_entries, &column_sizes_scan); + detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, &temp_weights, + &column_sizes_scan); } auto const& h_cuts_ptr = cuts_ptr.ConstHostVector(); diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh index 77424d7fa..0dcdad64d 100644 --- a/src/common/hist_util.cuh +++ b/src/common/hist_util.cuh @@ -240,10 +240,10 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran void SortByWeight(dh::device_vector* weights, dh::device_vector* sorted_entries); -void RemoveDuplicatedCategories( - int32_t device, MetaInfo const &info, Span d_cuts_ptr, - dh::device_vector *p_sorted_entries, - dh::caching_device_vector *p_column_sizes_scan); +void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span d_cuts_ptr, + dh::device_vector* p_sorted_entries, + dh::device_vector* p_sorted_weights, + dh::caching_device_vector* p_column_sizes_scan); } // namespace detail // Compute sketch on DMatrix. @@ -275,8 +275,8 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info, if (sketch_container->HasCategorical()) { auto d_cuts_ptr = cuts_ptr.DeviceSpan(); - detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, - &sorted_entries, &column_sizes_scan); + detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, nullptr, + &column_sizes_scan); } auto d_cuts_ptr = cuts_ptr.DeviceSpan(); @@ -354,8 +354,8 @@ void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info, if (sketch_container->HasCategorical()) { auto d_cuts_ptr = cuts_ptr.DeviceSpan(); - detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, - &sorted_entries, &column_sizes_scan); + detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, &temp_weights, + &column_sizes_scan); } auto const& h_cuts_ptr = cuts_ptr.ConstHostVector(); diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index a6780d433..20fd1043d 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -143,11 +143,14 @@ TEST(HistUtil, DeviceSketchCategoricalFeatures) { void TestMixedSketch() { size_t n_samples = 1000, n_features = 2, n_categories = 3; + bst_bin_t n_bins = 64; + std::vector data(n_samples * n_features); SimpleLCG gen; SimpleRealUniformDistribution cat_d{0.0f, static_cast(n_categories)}; SimpleRealUniformDistribution num_d{0.0f, 3.0f}; for (size_t i = 0; i < n_samples * n_features; ++i) { + // two features, row major. The first column is numeric and the second is categorical. 
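+    // Flat index i lands in column i % 2 of this row-major matrix, so even
+    // indices fill column 0, which feature_types below marks as categorical.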
if (i % 2 == 0) { data[i] = std::floor(cat_d(&gen)); } else { @@ -159,12 +162,75 @@ void TestMixedSketch() { m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical); m->Info().feature_types.HostVector().push_back(FeatureType::kNumerical); - auto cuts = DeviceSketch(0, m.get(), 64); - ASSERT_EQ(cuts.Values().size(), 64 + n_categories); + auto cuts = DeviceSketch(0, m.get(), n_bins); + ASSERT_EQ(cuts.Values().size(), n_bins + n_categories); } -TEST(HistUtil, DeviceSketchMixedFeatures) { - TestMixedSketch(); +TEST(HistUtil, DeviceSketchMixedFeatures) { TestMixedSketch(); } + +TEST(HistUtil, RemoveDuplicatedCategories) { + bst_row_t n_samples = 512; + bst_feature_t n_features = 3; + bst_cat_t n_categories = 5; + + auto ctx = MakeCUDACtx(0); + SimpleLCG rng; + SimpleRealUniformDistribution cat_d{0.0f, static_cast(n_categories)}; + + dh::device_vector sorted_entries(n_samples * n_features); + for (std::size_t i = 0; i < n_samples; ++i) { + for (bst_feature_t j = 0; j < n_features; ++j) { + float fvalue{0.0f}; + // The second column is categorical + if (j == 1) { + fvalue = std::floor(cat_d(&rng)); + } else { + fvalue = i; + } + sorted_entries[i * n_features + j] = Entry{j, fvalue}; + } + } + + MetaInfo info; + info.num_col_ = n_features; + info.num_row_ = n_samples; + info.feature_types.HostVector() = std::vector{ + FeatureType::kNumerical, FeatureType::kCategorical, FeatureType::kNumerical}; + ASSERT_EQ(info.feature_types.Size(), n_features); + + HostDeviceVector cuts_ptr{0, n_samples, n_samples * 2, n_samples * 3}; + cuts_ptr.SetDevice(0); + + dh::device_vector weight(n_samples * n_features, 0); + dh::Iota(dh::ToSpan(weight)); + + dh::caching_device_vector columns_ptr(4); + for (std::size_t i = 0; i < columns_ptr.size(); ++i) { + columns_ptr[i] = i * n_samples; + } + // sort into column major + thrust::sort_by_key(sorted_entries.begin(), sorted_entries.end(), weight.begin(), + detail::EntryCompareOp()); + + detail::RemoveDuplicatedCategories(ctx.gpu_id, info, cuts_ptr.DeviceSpan(), &sorted_entries, + &weight, &columns_ptr); + + auto const& h_cptr = cuts_ptr.ConstHostVector(); + ASSERT_EQ(h_cptr.back(), n_samples * 2 + n_categories); + // check numerical + for (std::size_t i = 0; i < n_samples; ++i) { + ASSERT_EQ(weight[i], i * 3); + } + auto beg = n_samples + n_categories; + for (std::size_t i = 0; i < n_samples; ++i) { + ASSERT_EQ(weight[i + beg], i * 3 + 2); + } + // check categorical + beg = n_samples; + for (std::size_t i = 0; i < n_categories; ++i) { + // all from the second column + ASSERT_EQ(static_cast(weight[i + beg]) % n_features, 1); + } } TEST(HistUtil, DeviceSketchMultipleColumns) { From bb2de1fd5d4ca159bd195284a59920ac9ea84005 Mon Sep 17 00:00:00 2001 From: Boris Date: Mon, 3 Jul 2023 21:31:33 +0200 Subject: [PATCH 018/136] xgboost4j-gpu_2.12-2.0.0: added libxgboost4j.so back. 
(#9351) --- jvm-packages/pom.xml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index 89055ad16..d2e363601 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -91,6 +91,9 @@ ON + + ON + xgboost4j-gpu xgboost4j-spark-gpu From 6c9c8a9001523d88f1ec9640715d01f1b9d966b0 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy Date: Tue, 4 Jul 2023 22:46:17 +0100 Subject: [PATCH 019/136] Enable Installation of Python Package with System lib in a Virtual Environment (#9349) --- .github/workflows/python_tests.yml | 41 ++++++++++++++++++++++++++++ doc/build.rst | 2 +- python-package/packager/nativelib.py | 4 +-- python-package/xgboost/libpath.py | 6 ++-- 4 files changed, 47 insertions(+), 6 deletions(-) diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index 98dc1b468..b9e97d439 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -255,3 +255,44 @@ jobs: shell: bash -l {0} run: | pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_spark + + python-system-installation-on-ubuntu: + name: Test XGBoost Python package System Installation on ${{ matrix.os }} + runs-on: ${{ matrix.os }} + strategy: + matrix: + os: [ubuntu-latest] + + steps: + - uses: actions/checkout@v2 + with: + submodules: 'true' + + - name: Set up Python 3.8 + uses: actions/setup-python@v4 + with: + python-version: 3.8 + + - name: Install ninja + run: | + sudo apt-get update && sudo apt-get install -y ninja-build + + - name: Build XGBoost on Ubuntu + run: | + mkdir build + cd build + cmake .. -GNinja + ninja + + - name: Copy lib to system lib + run: | + cp lib/* "$(python -c 'import sys; print(sys.base_prefix)')/lib" + + - name: Install XGBoost in Virtual Environment + run: | + cd python-package + pip install virtualenv + virtualenv venv + source venv/bin/activate && \ + pip install -v . --config-settings use_system_libxgboost=True && \ + python -c 'import xgboost' diff --git a/doc/build.rst b/doc/build.rst index e78d2d2f4..e30d57bc8 100644 --- a/doc/build.rst +++ b/doc/build.rst @@ -259,7 +259,7 @@ There are several ways to build and install the package from source: import sys import pathlib - libpath = pathlib.Path(sys.prefix).joinpath("lib", "libxgboost.so") + libpath = pathlib.Path(sys.base_prefix).joinpath("lib", "libxgboost.so") assert libpath.exists() Then pass ``use_system_libxgboost=True`` option to ``pip install``: diff --git a/python-package/packager/nativelib.py b/python-package/packager/nativelib.py index f1708d6c5..ff38fa11d 100644 --- a/python-package/packager/nativelib.py +++ b/python-package/packager/nativelib.py @@ -132,8 +132,8 @@ def locate_or_build_libxgboost( if build_config.use_system_libxgboost: # Find libxgboost from system prefix - sys_prefix = pathlib.Path(sys.prefix).absolute().resolve() - libxgboost_sys = sys_prefix / "lib" / _lib_name() + sys_base_prefix = pathlib.Path(sys.base_prefix).absolute().resolve() + libxgboost_sys = sys_base_prefix / "lib" / _lib_name() if not libxgboost_sys.exists(): raise RuntimeError( f"use_system_libxgboost was specified but {_lib_name()} is " diff --git a/python-package/xgboost/libpath.py b/python-package/xgboost/libpath.py index be37b364e..0437f3a4c 100644 --- a/python-package/xgboost/libpath.py +++ b/python-package/xgboost/libpath.py @@ -27,7 +27,7 @@ def find_lib_path() -> List[str]: os.path.join(curr_path, os.path.pardir, os.path.pardir, "lib"), # use libxgboost from a system prefix, if available. This should be the last # option. 
- os.path.join(sys.prefix, "lib"), + os.path.join(sys.base_prefix, "lib"), ] if sys.platform == "win32": @@ -62,8 +62,8 @@ def find_lib_path() -> List[str]: + ("\n- ".join(dll_path)) + "\nXGBoost Python package path: " + curr_path - + "\nsys.prefix: " - + sys.prefix + + "\nsys.base_prefix: " + + sys.base_prefix + "\nSee: " + link + " for installing XGBoost." From 645037e3764af8e8bfff44fde17d474984837cb0 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 5 Jul 2023 15:17:22 +0800 Subject: [PATCH 020/136] Improve test coverage with predictor configuration. (#9354) * Improve test coverage with predictor configuration. - Test with ext memory. - Test with QDM. - Test with dart. --- include/xgboost/context.h | 26 ++++++- include/xgboost/predictor.h | 14 ++-- src/common/error_msg.h | 4 ++ src/data/gradient_index_format.cc | 1 + src/data/gradient_index_page_source.cc | 10 ++- src/gbm/gbtree.cc | 73 +++++++++++-------- src/predictor/cpu_predictor.cc | 3 +- src/predictor/gpu_predictor.cu | 5 +- tests/cpp/data/test_gradient_index.cc | 21 +++--- tests/cpp/data/test_sparse_page_dmatrix.cc | 10 +-- tests/cpp/gbm/test_gbtree.cc | 82 ++++++++++++++++++++++ tests/cpp/gbm/test_gbtree.cu | 1 - tests/cpp/helpers.cc | 65 ++++++++++++++++- tests/cpp/helpers.cu | 5 +- tests/cpp/helpers.h | 28 +++++--- tests/cpp/predictor/test_cpu_predictor.cc | 2 +- tests/cpp/predictor/test_gpu_predictor.cu | 9 +-- 17 files changed, 280 insertions(+), 79 deletions(-) diff --git a/include/xgboost/context.h b/include/xgboost/context.h index b11ca70ec..de7648079 100644 --- a/include/xgboost/context.h +++ b/include/xgboost/context.h @@ -9,9 +9,10 @@ #include // for CHECK_GE #include // for XGBoostParameter -#include // for int16_t, int32_t, int64_t -#include // for shared_ptr -#include // for string, to_string +#include // for int16_t, int32_t, int64_t +#include // for shared_ptr +#include // for string, to_string +#include // for invoke_result_t, is_same_v namespace xgboost { @@ -152,6 +153,25 @@ struct Context : public XGBoostParameter { ctx.gpu_id = kCpuId; return ctx; } + /** + * @brief Call function based on the current device. + */ + template + decltype(auto) DispatchDevice(CPUFn&& cpu_fn, CUDAFn&& cuda_fn) const { + static_assert(std::is_same_v, std::invoke_result_t>); + switch (this->Device().device) { + case DeviceOrd::kCPU: + return cpu_fn(); + case DeviceOrd::kCUDA: + return cuda_fn(); + default: + // Do not use the device name as this is likely an internal error, the name + // wouldn't be valid. 
+ LOG(FATAL) << "Unknown device type:" << static_cast(this->Device().device); + break; + } + return std::invoke_result_t(); + } // declare parameters DMLC_DECLARE_PARAMETER(Context) { diff --git a/include/xgboost/predictor.h b/include/xgboost/predictor.h index 615bc0f39..f0d2e8e37 100644 --- a/include/xgboost/predictor.h +++ b/include/xgboost/predictor.h @@ -6,24 +6,22 @@ */ #pragma once #include -#include // DMatrixCache +#include // for DMatrixCache +#include // for Context #include #include #include -#include // std::function -#include +#include // for function +#include // for shared_ptr #include -#include // for get_id #include // for make_pair #include // Forward declarations -namespace xgboost { -namespace gbm { +namespace xgboost::gbm { struct GBTreeModel; -} // namespace gbm -} // namespace xgboost +} // namespace xgboost::gbm namespace xgboost { /** diff --git a/src/common/error_msg.h b/src/common/error_msg.h index e690a12f3..e9b9fc56b 100644 --- a/src/common/error_msg.h +++ b/src/common/error_msg.h @@ -47,5 +47,9 @@ inline void MaxFeatureSize(std::uint64_t n_features) { << "Unfortunately, XGBoost does not support data matrices with " << std::numeric_limits::max() << " features or greater"; } + +constexpr StringView InplacePredictProxy() { + return "Inplace predict accepts only DMatrixProxy as input."; +} } // namespace xgboost::error #endif // XGBOOST_COMMON_ERROR_MSG_H_ diff --git a/src/data/gradient_index_format.cc b/src/data/gradient_index_format.cc index ac52c0697..241abfb1f 100644 --- a/src/data/gradient_index_format.cc +++ b/src/data/gradient_index_format.cc @@ -68,6 +68,7 @@ class GHistIndexRawFormat : public SparsePageFormat { } std::size_t Write(GHistIndexMatrix const& page, common::AlignedFileWriteStream* fo) override { + CHECK_NE(page.index.Size(), 0) << "Empty page is not supported."; std::size_t bytes = 0; bytes += WriteHistogramCuts(page.cut, fo); // indptr diff --git a/src/data/gradient_index_page_source.cc b/src/data/gradient_index_page_source.cc index 6fa2f07e0..1b2ed3fdd 100644 --- a/src/data/gradient_index_page_source.cc +++ b/src/data/gradient_index_page_source.cc @@ -1,10 +1,9 @@ -/*! - * Copyright 2021-2022 by XGBoost Contributors +/** + * Copyright 2021-2023, XGBoost Contributors */ #include "gradient_index_page_source.h" -namespace xgboost { -namespace data { +namespace xgboost::data { void GradientIndexPageSource::Fetch() { if (!this->ReadCache()) { if (count_ != 0 && !sync_) { @@ -21,5 +20,4 @@ void GradientIndexPageSource::Fetch() { this->WriteCache(); } } -} // namespace data -} // namespace xgboost +} // namespace xgboost::data diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 9d595c378..b5c1573b1 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -18,7 +18,7 @@ #include #include "../common/common.h" -#include "../common/error_msg.h" // for UnknownDevice +#include "../common/error_msg.h" // for UnknownDevice, InplacePredictProxy #include "../common/random.h" #include "../common/threading_utils.h" #include "../common/timer.h" @@ -542,6 +542,18 @@ void GBTree::PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds, } } +namespace { +inline void MismatchedDevices(Context const* booster, Context const* data) { + LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. XGBoost " + << "is running on: " << booster->DeviceName() + << ", while the input data is on: " << data->DeviceName() << ".\n" + << R"(Potential solutions: +- Use a data structure that matches the device ordinal in the booster. 
+- Set the device for booster before call to inplace_predict. +)"; +} +}; // namespace + void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training, bst_layer_t layer_begin, bst_layer_t layer_end) { // dispatch to const function. @@ -555,24 +567,26 @@ void GBTree::InplacePredict(std::shared_ptr p_m, float missing, auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end); CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees."; if (p_m->Ctx()->Device() != this->ctx_->Device()) { - LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. XGBoost " - << "is running on: " << this->ctx_->DeviceName() - << ", while the input data is on: " << p_m->Ctx()->DeviceName() << "."; + MismatchedDevices(this->ctx_, p_m->Ctx()); CHECK_EQ(out_preds->version, 0); auto proxy = std::dynamic_pointer_cast(p_m); - auto any_adapter = proxy->Adapter(); + CHECK(proxy) << error::InplacePredictProxy(); auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing); this->PredictBatchImpl(p_fmat.get(), out_preds, false, layer_begin, layer_end); return; } - if (this->ctx_->IsCPU()) { - this->cpu_predictor_->InplacePredict(p_m, model_, missing, out_preds, tree_begin, tree_end); - } else if (p_m->Ctx()->IsCUDA()) { - CHECK(this->gpu_predictor_); - this->gpu_predictor_->InplacePredict(p_m, model_, missing, out_preds, tree_begin, tree_end); - } else { - LOG(FATAL) << error::UnknownDevice(); + bool known_type = this->ctx_->DispatchDevice( + [&, begin = tree_begin, end = tree_end] { + return this->cpu_predictor_->InplacePredict(p_m, model_, missing, out_preds, begin, end); + }, + [&, begin = tree_begin, end = tree_end] { + return this->gpu_predictor_->InplacePredict(p_m, model_, missing, out_preds, begin, end); + }); + if (!known_type) { + auto proxy = std::dynamic_pointer_cast(p_m); + CHECK(proxy) << error::InplacePredictProxy(); + LOG(FATAL) << "Unknown data type for inplace prediction:" << proxy->Adapter().type().name(); } } @@ -808,11 +822,9 @@ class Dart : public GBTree { auto n_groups = model_.learner_model_param->num_output_group; if (ctx_->Device() != p_fmat->Ctx()->Device()) { - LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. 
XGBoost " - << "is running on: " << this->ctx_->DeviceName() - << ", while the input data is on: " << p_fmat->Ctx()->DeviceName() << "."; + MismatchedDevices(ctx_, p_fmat->Ctx()); auto proxy = std::dynamic_pointer_cast(p_fmat); - auto any_adapter = proxy->Adapter(); + CHECK(proxy) << error::InplacePredictProxy(); auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing); this->PredictBatchImpl(p_fmat.get(), p_out_preds, false, layer_begin, layer_end); return; @@ -825,20 +837,15 @@ class Dart : public GBTree { } predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0); - auto get_predictor = [&]() -> Predictor const* { - if (ctx_->IsCPU()) { - return cpu_predictor_.get(); - } else if (ctx_->IsCUDA()) { - CHECK(this->gpu_predictor_); - return gpu_predictor_.get(); - } else { - LOG(FATAL) << error::UnknownDevice(); - return nullptr; - } - }; auto predict_impl = [&](size_t i) { predts.predictions.Fill(0); - bool success{get_predictor()->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1)}; + bool success = this->ctx_->DispatchDevice( + [&] { + return cpu_predictor_->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1); + }, + [&] { + return gpu_predictor_->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1); + }); CHECK(success) << msg; }; @@ -846,7 +853,15 @@ class Dart : public GBTree { for (bst_tree_t i = tree_begin; i < tree_end; ++i) { predict_impl(i); if (i == tree_begin) { - get_predictor()->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions, model_); + this->ctx_->DispatchDevice( + [&] { + this->cpu_predictor_->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions, + model_); + }, + [&] { + this->gpu_predictor_->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions, + model_); + }); } // Multiple the tree weight auto w = this->weight_drop_.at(i); diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc index b9cb02d56..c092c0b04 100644 --- a/src/predictor/cpu_predictor.cc +++ b/src/predictor/cpu_predictor.cc @@ -16,6 +16,7 @@ #include "../common/bitfield.h" // for RBitField8 #include "../common/categorical.h" // for IsCat, Decision #include "../common/common.h" // for DivRoundUp +#include "../common/error_msg.h" // for InplacePredictProxy #include "../common/math.h" // for CheckNAN #include "../common/threading_utils.h" // for ParallelFor #include "../data/adapter.h" // for ArrayAdapter, CSRAdapter, CSRArrayAdapter @@ -741,7 +742,7 @@ class CPUPredictor : public Predictor { PredictionCacheEntry *out_preds, uint32_t tree_begin, unsigned tree_end) const override { auto proxy = dynamic_cast(p_m.get()); - CHECK(proxy)<< "Inplace predict accepts only DMatrixProxy as input."; + CHECK(proxy)<< error::InplacePredictProxy(); CHECK(!p_m->Info().IsColumnSplit()) << "Inplace predict support for column-wise data split is not yet implemented."; auto x = proxy->Adapter(); diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 4ca0e33ff..578fda180 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -15,8 +15,9 @@ #include "../common/bitfield.h" #include "../common/categorical.h" #include "../common/common.h" -#include "../common/cuda_context.cuh" +#include "../common/cuda_context.cuh" // for CUDAContext #include "../common/device_helpers.cuh" +#include "../common/error_msg.h" // for InplacePredictProxy #include "../data/device_adapter.cuh" #include "../data/ellpack_page.cuh" #include "../data/proxy_dmatrix.h" @@ -989,7 +990,7 @@ class GPUPredictor : public 
xgboost::Predictor { PredictionCacheEntry* out_preds, uint32_t tree_begin, unsigned tree_end) const override { auto proxy = dynamic_cast(p_m.get()); - CHECK(proxy)<< "Inplace predict accepts only DMatrixProxy as input."; + CHECK(proxy) << error::InplacePredictProxy(); auto x = proxy->Adapter(); if (x.type() == typeid(std::shared_ptr)) { this->DispatchedInplacePredict dmat = CreateSparsePageDMatrix(10000); + auto p_fmat = RandomDataGenerator{4096, 256, 0.5} + .Device(ctx.gpu_id) + .Batches(8) + .GenerateSparsePageDMatrix("cache", true); + std::vector base_rowids; - std::vector hessian(dmat->Info().num_row_, 1); - for (auto const &page : dmat->GetBatches(&ctx, {64, hessian, true})) { + std::vector hessian(p_fmat->Info().num_row_, 1); + for (auto const &page : p_fmat->GetBatches(&ctx, {64, hessian, true})) { base_rowids.push_back(page.base_rowid); } - size_t i = 0; - for (auto const &page : dmat->GetBatches()) { + + std::size_t i = 0; + for (auto const &page : p_fmat->GetBatches()) { ASSERT_EQ(base_rowids[i], page.base_rowid); ++i; } base_rowids.clear(); - for (auto const &page : dmat->GetBatches(&ctx, {64, hessian, false})) { + for (auto const &page : p_fmat->GetBatches(&ctx, {64, hessian, false})) { base_rowids.push_back(page.base_rowid); } i = 0; - for (auto const &page : dmat->GetBatches()) { + for (auto const &page : p_fmat->GetBatches()) { ASSERT_EQ(base_rowids[i], page.base_rowid); ++i; } diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cc b/tests/cpp/data/test_sparse_page_dmatrix.cc index d1e9e6242..839ea762e 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cc +++ b/tests/cpp/data/test_sparse_page_dmatrix.cc @@ -76,9 +76,11 @@ TEST(SparsePageDMatrix, LoadFile) { // allow caller to retain pages so they can process multiple pages at the same time. template void TestRetainPage() { - auto m = CreateSparsePageDMatrix(10000); + std::size_t n_batches = 4; + auto p_fmat = RandomDataGenerator{1024, 128, 0.5f}.Batches(n_batches).GenerateSparsePageDMatrix( + "cache", true); Context ctx; - auto batches = m->GetBatches(&ctx); + auto batches = p_fmat->GetBatches(&ctx); auto begin = batches.begin(); auto end = batches.end(); @@ -94,7 +96,7 @@ void TestRetainPage() { } ASSERT_EQ(pages.back().Size(), (*it).Size()); } - ASSERT_GE(iterators.size(), 2); + ASSERT_GE(iterators.size(), n_batches); for (size_t i = 0; i < iterators.size(); ++i) { ASSERT_EQ((*iterators[i]).Size(), pages.at(i).Size()); @@ -102,7 +104,7 @@ void TestRetainPage() { } // make sure it's const and the caller can not modify the content of page. 
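+  // The static_assert below checks that the batch iterator only hands out
+  // const pages, so callers cannot mutate cached content.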
- for (auto &page : m->GetBatches({&ctx})) { + for (auto &page : p_fmat->GetBatches({&ctx})) { static_assert(std::is_const>::value); } } diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc index 2bc0b2c6b..f57b1f47c 100644 --- a/tests/cpp/gbm/test_gbtree.cc +++ b/tests/cpp/gbm/test_gbtree.cc @@ -514,4 +514,86 @@ TEST(GBTree, PredictRange) { dmlc::Error); } } + +TEST(GBTree, InplacePredictionError) { + std::size_t n_samples{2048}, n_features{32}; + + auto test_ext_err = [&](std::string booster, Context const* ctx) { + std::shared_ptr p_fmat = + RandomDataGenerator{n_samples, n_features, 0.5f}.Batches(2).GenerateSparsePageDMatrix( + "cache", true); + std::unique_ptr learner{Learner::Create({p_fmat})}; + learner->SetParam("booster", booster); + ConfigLearnerByCtx(ctx, learner.get()); + learner->Configure(); + for (std::int32_t i = 0; i < 3; ++i) { + learner->UpdateOneIter(i, p_fmat); + } + HostDeviceVector* out_predt; + ASSERT_THROW( + { + learner->InplacePredict(p_fmat, PredictionType::kValue, + std::numeric_limits::quiet_NaN(), &out_predt, 0, 0); + }, + dmlc::Error); + }; + + { + Context ctx; + test_ext_err("gbtree", &ctx); + test_ext_err("dart", &ctx); + } + +#if defined(XGBOOST_USE_CUDA) + { + auto ctx = MakeCUDACtx(0); + test_ext_err("gbtree", &ctx); + test_ext_err("dart", &ctx); + } +#endif // defined(XGBOOST_USE_CUDA) + + auto test_qdm_err = [&](std::string booster, Context const* ctx) { + std::shared_ptr p_fmat; + bst_bin_t max_bins = 16; + auto rng = RandomDataGenerator{n_samples, n_features, 0.5f}.Device(ctx->gpu_id).Bins(max_bins); + if (ctx->IsCPU()) { + p_fmat = rng.GenerateQuantileDMatrix(true); + } else { +#if defined(XGBOOST_USE_CUDA) + p_fmat = rng.GenerateDeviceDMatrix(true); +#else + CHECK(p_fmat); +#endif // defined(XGBOOST_USE_CUDA) + }; + std::unique_ptr learner{Learner::Create({p_fmat})}; + learner->SetParam("booster", booster); + learner->SetParam("max_bin", std::to_string(max_bins)); + ConfigLearnerByCtx(ctx, learner.get()); + learner->Configure(); + for (std::int32_t i = 0; i < 3; ++i) { + learner->UpdateOneIter(i, p_fmat); + } + HostDeviceVector* out_predt; + ASSERT_THROW( + { + learner->InplacePredict(p_fmat, PredictionType::kValue, + std::numeric_limits::quiet_NaN(), &out_predt, 0, 0); + }, + dmlc::Error); + }; + + { + Context ctx; + test_qdm_err("gbtree", &ctx); + test_qdm_err("dart", &ctx); + } + +#if defined(XGBOOST_USE_CUDA) + { + auto ctx = MakeCUDACtx(0); + test_qdm_err("gbtree", &ctx); + test_qdm_err("dart", &ctx); + } +#endif // defined(XGBOOST_USE_CUDA) +} } // namespace xgboost diff --git a/tests/cpp/gbm/test_gbtree.cu b/tests/cpp/gbm/test_gbtree.cu index 2393bfabd..7321be75e 100644 --- a/tests/cpp/gbm/test_gbtree.cu +++ b/tests/cpp/gbm/test_gbtree.cu @@ -61,7 +61,6 @@ void TestInplaceFallback(Context const* ctx) { learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits::quiet_NaN(), &out_predt, 0, 0); auto output = testing::internal::GetCapturedStderr(); - std::cout << "output:" << output << std::endl; ASSERT_NE(output.find("Falling back"), std::string::npos); // test when the contexts match diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 49ff5e412..4f44b7b1e 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -210,6 +210,16 @@ SimpleLCG::StateType SimpleLCG::Max() const { return max(); } // Make sure it's compile time constant. 
static_assert(SimpleLCG::max() - SimpleLCG::min()); +void RandomDataGenerator::GenerateLabels(std::shared_ptr p_fmat) const { + RandomDataGenerator{p_fmat->Info().num_row_, this->n_targets_, 0.0f}.GenerateDense( + p_fmat->Info().labels.Data()); + CHECK_EQ(p_fmat->Info().labels.Size(), this->rows_ * this->n_targets_); + p_fmat->Info().labels.Reshape(this->rows_, this->n_targets_); + if (device_ != Context::kCpuId) { + p_fmat->Info().labels.SetDevice(device_); + } +} + void RandomDataGenerator::GenerateDense(HostDeviceVector *out) const { xgboost::SimpleRealUniformDistribution dist(lower_, upper_); CHECK(out); @@ -363,8 +373,9 @@ void RandomDataGenerator::GenerateCSR( CHECK_EQ(columns->Size(), value->Size()); } -std::shared_ptr RandomDataGenerator::GenerateDMatrix(bool with_label, bool float_label, - size_t classes) const { +[[nodiscard]] std::shared_ptr RandomDataGenerator::GenerateDMatrix(bool with_label, + bool float_label, + size_t classes) const { HostDeviceVector data; HostDeviceVector rptrs; HostDeviceVector columns; @@ -406,10 +417,58 @@ std::shared_ptr RandomDataGenerator::GenerateDMatrix(bool with_label, b return out; } -std::shared_ptr RandomDataGenerator::GenerateQuantileDMatrix() { +[[nodiscard]] std::shared_ptr RandomDataGenerator::GenerateSparsePageDMatrix( + std::string prefix, bool with_label) const { + CHECK_GE(this->rows_, this->n_batches_); + CHECK_GE(this->n_batches_, 1) + << "Must set the n_batches before generating an external memory DMatrix."; + std::unique_ptr iter; + if (device_ == Context::kCpuId) { + iter = std::make_unique(this->sparsity_, rows_, cols_, n_batches_); + } else { +#if defined(XGBOOST_USE_CUDA) + iter = std::make_unique(this->sparsity_, rows_, cols_, n_batches_); +#else + CHECK(iter); +#endif // defined(XGBOOST_USE_CUDA) + } + + std::unique_ptr dmat{ + DMatrix::Create(static_cast(iter.get()), iter->Proxy(), Reset, Next, + std::numeric_limits::quiet_NaN(), Context{}.Threads(), prefix)}; + + auto row_page_path = + data::MakeId(prefix, dynamic_cast(dmat.get())) + ".row.page"; + EXPECT_TRUE(FileExists(row_page_path)) << row_page_path; + + // Loop over the batches and count the number of pages + std::size_t batch_count = 0; + bst_row_t row_count = 0; + for (const auto& batch : dmat->GetBatches()) { + batch_count++; + row_count += batch.Size(); + CHECK_NE(batch.data.Size(), 0); + } + + EXPECT_EQ(batch_count, n_batches_); + EXPECT_EQ(row_count, dmat->Info().num_row_); + + if (with_label) { + RandomDataGenerator{dmat->Info().num_row_, this->n_targets_, 0.0f}.GenerateDense( + dmat->Info().labels.Data()); + CHECK_EQ(dmat->Info().labels.Size(), this->rows_ * this->n_targets_); + dmat->Info().labels.Reshape(this->rows_, this->n_targets_); + } + return dmat; +} + +std::shared_ptr RandomDataGenerator::GenerateQuantileDMatrix(bool with_label) { NumpyArrayIterForTest iter{this->sparsity_, this->rows_, this->cols_, 1}; auto m = std::make_shared( &iter, iter.Proxy(), nullptr, Reset, Next, std::numeric_limits::quiet_NaN(), 0, bins_); + if (with_label) { + this->GenerateLabels(m); + } return m; } diff --git a/tests/cpp/helpers.cu b/tests/cpp/helpers.cu index f72281cb4..10b800fc1 100644 --- a/tests/cpp/helpers.cu +++ b/tests/cpp/helpers.cu @@ -24,10 +24,13 @@ int CudaArrayIterForTest::Next() { return 1; } -std::shared_ptr RandomDataGenerator::GenerateDeviceDMatrix() { +std::shared_ptr RandomDataGenerator::GenerateDeviceDMatrix(bool with_label) { CudaArrayIterForTest iter{this->sparsity_, this->rows_, this->cols_, 1}; auto m = std::make_shared( &iter, iter.Proxy(), 
nullptr, Reset, Next, std::numeric_limits::quiet_NaN(), 0, bins_); + if (with_label) { + this->GenerateLabels(m); + } return m; } } // namespace xgboost diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index 035baf22a..449d97a40 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -238,15 +238,18 @@ class RandomDataGenerator { bst_target_t n_targets_{1}; std::int32_t device_{Context::kCpuId}; + std::size_t n_batches_{0}; std::uint64_t seed_{0}; SimpleLCG lcg_; - std::size_t bins_{0}; + bst_bin_t bins_{0}; std::vector ft_; bst_cat_t max_cat_; Json ArrayInterfaceImpl(HostDeviceVector* storage, size_t rows, size_t cols) const; + void GenerateLabels(std::shared_ptr p_fmat) const; + public: RandomDataGenerator(bst_row_t rows, size_t cols, float sparsity) : rows_{rows}, cols_{cols}, sparsity_{sparsity}, lcg_{seed_} {} @@ -263,12 +266,16 @@ class RandomDataGenerator { device_ = d; return *this; } + RandomDataGenerator& Batches(std::size_t n_batches) { + n_batches_ = n_batches; + return *this; + } RandomDataGenerator& Seed(uint64_t s) { seed_ = s; lcg_.Seed(seed_); return *this; } - RandomDataGenerator& Bins(size_t b) { + RandomDataGenerator& Bins(bst_bin_t b) { bins_ = b; return *this; } @@ -309,12 +316,17 @@ class RandomDataGenerator { void GenerateCSR(HostDeviceVector* value, HostDeviceVector* row_ptr, HostDeviceVector* columns) const; - std::shared_ptr GenerateDMatrix(bool with_label = false, bool float_label = true, - size_t classes = 1) const; + [[nodiscard]] std::shared_ptr GenerateDMatrix(bool with_label = false, + bool float_label = true, + size_t classes = 1) const; + + [[nodiscard]] std::shared_ptr GenerateSparsePageDMatrix(std::string prefix, + bool with_label) const; + #if defined(XGBOOST_USE_CUDA) - std::shared_ptr GenerateDeviceDMatrix(); + std::shared_ptr GenerateDeviceDMatrix(bool with_label); #endif - std::shared_ptr GenerateQuantileDMatrix(); + std::shared_ptr GenerateQuantileDMatrix(bool with_label); }; // Generate an empty DMatrix, mostly for its meta info. 
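The ``Batches()`` setter and ``GenerateSparsePageDMatrix()`` above drive the same
iterator-based external-memory path that the public Python API exposes through
``xgboost.DataIter``. A minimal sketch of that path, assuming the documented
iterator protocol (the batch shapes and cache prefix below are arbitrary
choices):

    import numpy as np
    import xgboost as xgb

    class Batches(xgb.DataIter):
        """Yield a fixed number of random batches for external-memory use."""

        def __init__(self, n_batches: int) -> None:
            self._i, self._n = 0, n_batches
            # A cache prefix enables the external-memory (SparsePage) code path.
            super().__init__(cache_prefix="cache")

        def next(self, input_data) -> int:
            if self._i == self._n:
                return 0  # signal that there are no more batches
            rng = np.random.default_rng(self._i)
            input_data(data=rng.random((256, 16)), label=rng.random(256))
            self._i += 1
            return 1

        def reset(self) -> None:
            self._i = 0

    Xy = xgb.DMatrix(Batches(4))  # one SparsePage is cached per batch

Each call to ``next`` hands one batch to XGBoost, mirroring the ``n_batches_``
bookkeeping that the test generator asserts on.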
@@ -443,11 +455,11 @@ class ArrayIterForTest { size_t static constexpr Cols() { return 13; } public: - std::string AsArray() const { return interface_; } + [[nodiscard]] std::string AsArray() const { return interface_; } virtual int Next() = 0; virtual void Reset() { iter_ = 0; } - size_t Iter() const { return iter_; } + [[nodiscard]] std::size_t Iter() const { return iter_; } auto Proxy() -> decltype(proxy_) { return proxy_; } explicit ArrayIterForTest(float sparsity, size_t rows, size_t cols, size_t batches); diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc index 087543cfe..841a576d5 100644 --- a/tests/cpp/predictor/test_cpu_predictor.cc +++ b/tests/cpp/predictor/test_cpu_predictor.cc @@ -216,7 +216,7 @@ void TestUpdatePredictionCache(bool use_subsampling) { TEST(CPUPredictor, GHistIndex) { size_t constexpr kRows{128}, kCols{16}, kBins{64}; - auto p_hist = RandomDataGenerator{kRows, kCols, 0.0}.Bins(kBins).GenerateQuantileDMatrix(); + auto p_hist = RandomDataGenerator{kRows, kCols, 0.0}.Bins(kBins).GenerateQuantileDMatrix(false); HostDeviceVector storage(kRows * kCols); auto columnar = RandomDataGenerator{kRows, kCols, 0.0}.GenerateArrayInterface(&storage); auto adapter = data::ArrayAdapter(columnar.c_str()); diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu index 30fbaf997..15fbd462e 100644 --- a/tests/cpp/predictor/test_gpu_predictor.cu +++ b/tests/cpp/predictor/test_gpu_predictor.cu @@ -123,7 +123,8 @@ TEST(GPUPredictor, EllpackBasic) { auto ctx = MakeCUDACtx(0); for (size_t bins = 2; bins < 258; bins += 16) { size_t rows = bins * 16; - auto p_m = RandomDataGenerator{rows, kCols, 0.0}.Bins(bins).Device(0).GenerateDeviceDMatrix(); + auto p_m = + RandomDataGenerator{rows, kCols, 0.0}.Bins(bins).Device(0).GenerateDeviceDMatrix(false); ASSERT_FALSE(p_m->PageExists()); TestPredictionFromGradientIndex(&ctx, rows, kCols, p_m); TestPredictionFromGradientIndex(&ctx, bins, kCols, p_m); @@ -133,7 +134,7 @@ TEST(GPUPredictor, EllpackBasic) { TEST(GPUPredictor, EllpackTraining) { size_t constexpr kRows { 128 }, kCols { 16 }, kBins { 64 }; auto p_ellpack = - RandomDataGenerator{kRows, kCols, 0.0}.Bins(kBins).Device(0).GenerateDeviceDMatrix(); + RandomDataGenerator{kRows, kCols, 0.0}.Bins(kBins).Device(0).GenerateDeviceDMatrix(false); HostDeviceVector storage(kRows * kCols); auto columnar = RandomDataGenerator{kRows, kCols, 0.0} .Device(0) @@ -219,7 +220,7 @@ TEST(GPUPredictor, ShapStump) { gbm::GBTreeModel model(&mparam, &ctx); std::vector> trees; - trees.push_back(std::unique_ptr(new RegTree)); + trees.push_back(std::make_unique()); model.CommitModelGroup(std::move(trees), 0); auto gpu_lparam = MakeCUDACtx(0); @@ -246,7 +247,7 @@ TEST(GPUPredictor, Shap) { gbm::GBTreeModel model(&mparam, &ctx); std::vector> trees; - trees.push_back(std::unique_ptr(new RegTree)); + trees.push_back(std::make_unique()); trees[0]->ExpandNode(0, 0, 0.5, true, 1.0, -1.0, 1.0, 0.0, 5.0, 2.0, 3.0); model.CommitModelGroup(std::move(trees), 0); From b572a3991922404cdbd52db61fe5eaaba3c6242e Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 5 Jul 2023 16:49:25 +0800 Subject: [PATCH 021/136] [doc] Fix removed reference. 
(#9358) --- doc/tutorials/param_tuning.rst | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/tutorials/param_tuning.rst b/doc/tutorials/param_tuning.rst index cb58fcc20..5ede195f3 100644 --- a/doc/tutorials/param_tuning.rst +++ b/doc/tutorials/param_tuning.rst @@ -97,7 +97,6 @@ amount of memory: array = np.array(...) You can find some more specific memory reduction practices scattered through the documents -For instances: :doc:`/tutorials/dask`, :doc:`/gpu/index`, -:doc:`/contrib/scaling`. However, before going into these, being conscious about making -data copies is a good starting point. It usually consumes a lot more memory than people -expect. +For instances: :doc:`/tutorials/dask`, :doc:`/gpu/index`. However, before going into +these, being conscious about making data copies is a good starting point. It usually +consumes a lot more memory than people expect. From 41c681349605935af88b9dba56ceff4e6ea0f773 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 5 Jul 2023 20:20:07 +0800 Subject: [PATCH 022/136] Preserve order of saved updaters config. (#9355) - Save the updater sequence as an array instead of object. - Warn only once. The compatibility is kept, but we should be able to break it as the config is not loaded in pickle model and it's declared to be not stable. --- .../tests/testthat/test_model_compatibility.R | 34 +++++---------- src/common/error_msg.h | 28 +++++++++++++ src/data/simple_dmatrix.cc | 14 +++++-- src/gbm/gbtree.cc | 42 ++++++++++++------- src/learner.cc | 25 +++-------- tests/cpp/gbm/test_gbtree.cc | 40 +++++++++++++----- 6 files changed, 112 insertions(+), 71 deletions(-) diff --git a/R-package/tests/testthat/test_model_compatibility.R b/R-package/tests/testthat/test_model_compatibility.R index a52e08f76..2f4992c06 100644 --- a/R-package/tests/testthat/test_model_compatibility.R +++ b/R-package/tests/testthat/test_model_compatibility.R @@ -76,32 +76,20 @@ test_that("Models from previous versions of XGBoost can be loaded", { name <- m[3] is_rds <- endsWith(model_file, '.rds') is_json <- endsWith(model_file, '.json') - - cpp_warning <- capture.output({ - # Expect an R warning when a model is loaded from RDS and it was generated by version < 1.1.x - if (is_rds && compareVersion(model_xgb_ver, '1.1.1.1') < 0) { + # Expect an R warning when a model is loaded from RDS and it was generated by version < 1.1.x + if (is_rds && compareVersion(model_xgb_ver, '1.1.1.1') < 0) { + booster <- readRDS(model_file) + expect_warning(predict(booster, newdata = pred_data)) + booster <- readRDS(model_file) + expect_warning(run_booster_check(booster, name)) + } else { + if (is_rds) { booster <- readRDS(model_file) - expect_warning(predict(booster, newdata = pred_data)) - booster <- readRDS(model_file) - expect_warning(run_booster_check(booster, name)) } else { - if (is_rds) { - booster <- readRDS(model_file) - } else { - booster <- xgb.load(model_file) - } - predict(booster, newdata = pred_data) - run_booster_check(booster, name) + booster <- xgb.load(model_file) } - }) - cpp_warning <- paste0(cpp_warning, collapse = ' ') - if (is_rds && compareVersion(model_xgb_ver, '1.1.1.1') >= 0) { - # Expect a C++ warning when a model is loaded from RDS and it was generated by old XGBoost` - m <- grepl(paste0('.*If you are loading a serialized model ', - '\\(like pickle in Python, RDS in R\\).*', - 'for more details about differences between ', - 'saving model and serializing.*'), cpp_warning, perl = TRUE) - expect_true(length(m) > 0 && all(m)) + predict(booster, 
newdata = pred_data) + run_booster_check(booster, name) } }) }) diff --git a/src/common/error_msg.h b/src/common/error_msg.h index e9b9fc56b..37d70b144 100644 --- a/src/common/error_msg.h +++ b/src/common/error_msg.h @@ -51,5 +51,33 @@ inline void MaxFeatureSize(std::uint64_t n_features) { constexpr StringView InplacePredictProxy() { return "Inplace predict accepts only DMatrixProxy as input."; } + +inline void MaxSampleSize(std::size_t n) { + LOG(FATAL) << "Sample size too large for the current updater. Maximum number of samples:" << n + << ". Consider using a different updater or tree_method."; +} + +constexpr StringView OldSerialization() { + return R"doc(If you are loading a serialized model (like pickle in Python, RDS in R) or +configuration generated by an older version of XGBoost, please export the model by calling +`Booster.save_model` from that version first, then load it back in current version. See: + + https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html + +for more details about differences between saving model and serializing. +)doc"; +} + +inline void WarnOldSerialization() { + // Display it once is enough. Otherwise this can be really verbose in distributed + // environments. + static thread_local bool logged{false}; + if (logged) { + return; + } + + LOG(WARNING) << OldSerialization(); + logged = true; +} } // namespace xgboost::error #endif // XGBOOST_COMMON_ERROR_MSG_H_ diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc index 7855ccb18..b77c8fd84 100644 --- a/src/data/simple_dmatrix.cc +++ b/src/data/simple_dmatrix.cc @@ -21,8 +21,7 @@ #include "xgboost/c_api.h" #include "xgboost/data.h" -namespace xgboost { -namespace data { +namespace xgboost::data { MetaInfo& SimpleDMatrix::Info() { return info_; } const MetaInfo& SimpleDMatrix::Info() const { return info_; } @@ -97,6 +96,10 @@ BatchSet SimpleDMatrix::GetRowBatches() { BatchSet SimpleDMatrix::GetColumnBatches(Context const* ctx) { // column page doesn't exist, generate it if (!column_page_) { + auto n = std::numeric_limits::max(); + if (this->sparse_page_->Size() > n) { + error::MaxSampleSize(n); + } column_page_.reset(new CSCPage(sparse_page_->GetTranspose(info_.num_col_, ctx->Threads()))); } auto begin_iter = BatchIterator(new SimpleBatchIteratorImpl(column_page_)); @@ -106,6 +109,10 @@ BatchSet SimpleDMatrix::GetColumnBatches(Context const* ctx) { BatchSet SimpleDMatrix::GetSortedColumnBatches(Context const* ctx) { // Sorted column page doesn't exist, generate it if (!sorted_column_page_) { + auto n = std::numeric_limits::max(); + if (this->sparse_page_->Size() > n) { + error::MaxSampleSize(n); + } sorted_column_page_.reset( new SortedCSCPage(sparse_page_->GetTranspose(info_.num_col_, ctx->Threads()))); sorted_column_page_->SortRows(ctx->Threads()); @@ -427,5 +434,4 @@ SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, i fmat_ctx_ = ctx; } -} // namespace data -} // namespace xgboost +} // namespace xgboost::data diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index b5c1573b1..4f8cd47bb 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -18,7 +18,7 @@ #include #include "../common/common.h" -#include "../common/error_msg.h" // for UnknownDevice, InplacePredictProxy +#include "../common/error_msg.h" // for UnknownDevice, WarnOldSerialization, InplacePredictProxy #include "../common/random.h" #include "../common/threading_utils.h" #include "../common/timer.h" @@ -391,19 +391,32 @@ void GBTree::LoadConfig(Json const& in) { LOG(WARNING) << msg << " 
Changing `tree_method` to `hist`."; } - auto const& j_updaters = get(in["updater"]); + std::vector updater_seq; + if (IsA(in["updater"])) { + // before 2.0 + error::WarnOldSerialization(); + for (auto const& kv : get(in["updater"])) { + auto name = kv.first; + auto config = kv.second; + config["name"] = name; + updater_seq.push_back(config); + } + } else { + // after 2.0 + auto const& j_updaters = get(in["updater"]); + updater_seq = j_updaters; + } + updaters_.clear(); - for (auto const& kv : j_updaters) { - auto name = kv.first; + for (auto const& config : updater_seq) { + auto name = get(config["name"]); if (n_gpus == 0 && name == "grow_gpu_hist") { name = "grow_quantile_histmaker"; LOG(WARNING) << "Changing updater from `grow_gpu_hist` to `grow_quantile_histmaker`."; } - std::unique_ptr up{ - TreeUpdater::Create(name, ctx_, &model_.learner_model_param->task)}; - up->LoadConfig(kv.second); - updaters_.push_back(std::move(up)); + updaters_.emplace_back(TreeUpdater::Create(name, ctx_, &model_.learner_model_param->task)); + updaters_.back()->LoadConfig(config); } specified_updater_ = get(in["specified_updater"]); @@ -425,13 +438,14 @@ void GBTree::SaveConfig(Json* p_out) const { // language binding doesn't need to know about the forest size. out["gbtree_model_param"] = ToJson(model_.param); - out["updater"] = Object(); + out["updater"] = Array{}; + auto& j_updaters = get(out["updater"]); - auto& j_updaters = out["updater"]; - for (auto const& up : updaters_) { - j_updaters[up->Name()] = Object(); - auto& j_up = j_updaters[up->Name()]; - up->SaveConfig(&j_up); + for (auto const& up : this->updaters_) { + Json up_config{Object{}}; + up_config["name"] = String{up->Name()}; + up->SaveConfig(&up_config); + j_updaters.emplace_back(up_config); } out["specified_updater"] = Boolean{specified_updater_}; } diff --git a/src/learner.cc b/src/learner.cc index d2f1c774d..4fd0a0f09 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -40,7 +40,7 @@ #include "common/api_entry.h" // for XGBAPIThreadLocalEntry #include "common/charconv.h" // for to_chars, to_chars_result, NumericLimits, from_... #include "common/common.h" // for ToString, Split -#include "common/error_msg.h" // for MaxFeatureSize +#include "common/error_msg.h" // for MaxFeatureSize, WarnOldSerialization #include "common/io.h" // for PeekableInStream, ReadAll, FixedSizeStream, Mem... #include "common/observer.h" // for TrainingObserver #include "common/random.h" // for GlobalRandom @@ -357,21 +357,6 @@ DMLC_REGISTER_PARAMETER(LearnerTrainParam); using LearnerAPIThreadLocalStore = dmlc::ThreadLocalStore>; -namespace { -StringView ModelMsg() { - return StringView{ - R"doc( - If you are loading a serialized model (like pickle in Python, RDS in R) generated by - older XGBoost, please export the model by calling `Booster.save_model` from that version - first, then load it back in current version. See: - - https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html - - for more details about differences between saving model and serializing. 
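The migration that this message recommends looks roughly as follows from Python; the file names are placeholders and the two steps run under different XGBoost installations (a sketch, not part of this patch):

    import pickle
    import xgboost as xgb

    # Step 1, with the *old* XGBoost version: re-export the pickled model.
    with open("old_model.pkl", "rb") as fh:  # hypothetical file name
        booster = pickle.load(fh)
    booster.save_model("model.json")

    # Step 2, with the current version: load the stable format back.
    booster = xgb.Booster()
    booster.load_model("model.json")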
-)doc"}; -} -} // anonymous namespace - class LearnerConfiguration : public Learner { private: std::mutex config_lock_; @@ -531,7 +516,7 @@ class LearnerConfiguration : public Learner { } if (!Version::Same(origin_version)) { - LOG(WARNING) << ModelMsg(); + error::WarnOldSerialization(); return; // skip configuration if version is not matched } @@ -562,7 +547,7 @@ class LearnerConfiguration : public Learner { for (size_t i = 0; i < n_metrics; ++i) { auto old_serialization = IsA(j_metrics[i]); if (old_serialization) { - LOG(WARNING) << ModelMsg(); + error::WarnOldSerialization(); metric_names_[i] = get(j_metrics[i]); } else { metric_names_[i] = get(j_metrics[i]["name"]); @@ -1173,7 +1158,7 @@ class LearnerIO : public LearnerConfiguration { Json memory_snapshot; if (header[1] == '"') { memory_snapshot = Json::Load(StringView{buffer}); - LOG(WARNING) << ModelMsg(); + error::WarnOldSerialization(); } else if (std::isalpha(header[1])) { memory_snapshot = Json::Load(StringView{buffer}, std::ios::binary); } else { @@ -1192,7 +1177,7 @@ class LearnerIO : public LearnerConfiguration { header.resize(serialisation_header_.size()); CHECK_EQ(fp.Read(&header[0], header.size()), serialisation_header_.size()); // Avoid printing the content in loaded header, which might be random binary code. - CHECK(header == serialisation_header_) << ModelMsg(); + CHECK(header == serialisation_header_) << error::OldSerialization(); int64_t sz {-1}; CHECK_EQ(fp.Read(&sz, sizeof(sz)), sizeof(sz)); if (!DMLC_IO_NO_ENDIAN_SWAP) { diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc index f57b1f47c..7a3008cff 100644 --- a/tests/cpp/gbm/test_gbtree.cc +++ b/tests/cpp/gbm/test_gbtree.cc @@ -174,32 +174,52 @@ TEST(GBTree, JsonIO) { Context ctx; LearnerModelParam mparam{MakeMP(kCols, .5, 1)}; - std::unique_ptr gbm { - CreateTrainedGBM("gbtree", Args{}, kRows, kCols, &mparam, &ctx) }; + std::unique_ptr gbm{ + CreateTrainedGBM("gbtree", Args{{"tree_method", "exact"}, {"default_direction", "left"}}, + kRows, kCols, &mparam, &ctx)}; - Json model {Object()}; + Json model{Object()}; model["model"] = Object(); - auto& j_model = model["model"]; + auto j_model = model["model"]; model["config"] = Object(); - auto& j_param = model["config"]; + auto j_config = model["config"]; gbm->SaveModel(&j_model); - gbm->SaveConfig(&j_param); + gbm->SaveConfig(&j_config); std::string model_str; Json::Dump(model, &model_str); model = Json::Load({model_str.c_str(), model_str.size()}); - ASSERT_EQ(get(model["model"]["name"]), "gbtree"); + j_model = model["model"]; + j_config = model["config"]; + ASSERT_EQ(get(j_model["name"]), "gbtree"); - auto const& gbtree_model = model["model"]["model"]; + auto gbtree_model = j_model["model"]; ASSERT_EQ(get(gbtree_model["trees"]).size(), 1ul); ASSERT_EQ(get(get(get(gbtree_model["trees"]).front()).at("id")), 0); ASSERT_EQ(get(gbtree_model["tree_info"]).size(), 1ul); - - auto j_train_param = model["config"]["gbtree_model_param"]; + auto j_train_param = j_config["gbtree_model_param"]; ASSERT_EQ(get(j_train_param["num_parallel_tree"]), "1"); + + auto check_config = [](Json j_up_config) { + auto colmaker = get(j_up_config).front(); + auto pruner = get(j_up_config).back(); + ASSERT_EQ(get(colmaker["name"]), "grow_colmaker"); + ASSERT_EQ(get(pruner["name"]), "prune"); + ASSERT_EQ(get(colmaker["colmaker_train_param"]["default_direction"]), "left"); + }; + check_config(j_config["updater"]); + + std::unique_ptr loaded(gbm::GBTree::Create("gbtree", &ctx, &mparam)); + loaded->LoadModel(j_model); + 
loaded->LoadConfig(j_config); + + // roundtrip test + Json j_config_rt{Object{}}; + loaded->SaveConfig(&j_config_rt); + check_config(j_config_rt["updater"]); } TEST(Dart, JsonIO) { From 15ca12a77ebbaf76515291064c24d8c2268400fd Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Thu, 6 Jul 2023 20:21:35 -0700 Subject: [PATCH 023/136] Fix NCCL test hang (#9367) --- src/collective/communicator.cu | 16 ++++++++++++---- src/collective/nccl_device_communicator.cu | 10 ++++++++-- src/collective/nccl_device_communicator.cuh | 16 +++++++++++++++- .../collective/test_nccl_device_communicator.cu | 2 +- 4 files changed, 36 insertions(+), 8 deletions(-) diff --git a/src/collective/communicator.cu b/src/collective/communicator.cu index 8cdb7f2fd..915a3beca 100644 --- a/src/collective/communicator.cu +++ b/src/collective/communicator.cu @@ -29,10 +29,18 @@ DeviceCommunicator* Communicator::GetDevice(int device_ordinal) { old_device_ordinal = device_ordinal; old_world_size = communicator_->GetWorldSize(); #ifdef XGBOOST_USE_NCCL - if (type_ != CommunicatorType::kFederated) { - device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal)); - } else { - device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal)); + switch (type_) { + case CommunicatorType::kRabit: + device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false)); + break; + case CommunicatorType::kFederated: + device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal)); + break; + case CommunicatorType::kInMemory: + device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, true)); + break; + default: + device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false)); } #else device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal)); diff --git a/src/collective/nccl_device_communicator.cu b/src/collective/nccl_device_communicator.cu index 7f5686075..470700d2d 100644 --- a/src/collective/nccl_device_communicator.cu +++ b/src/collective/nccl_device_communicator.cu @@ -7,8 +7,11 @@ namespace xgboost { namespace collective { -NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal) - : device_ordinal_{device_ordinal}, world_size_{GetWorldSize()}, rank_{GetRank()} { +NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sync) + : device_ordinal_{device_ordinal}, + needs_sync_{needs_sync}, + world_size_{GetWorldSize()}, + rank_{GetRank()} { if (device_ordinal_ < 0) { LOG(FATAL) << "Invalid device ordinal: " << device_ordinal_; } @@ -140,6 +143,9 @@ void NcclDeviceCommunicator::BitwiseAllReduce(void *send_receive_buffer, std::si // First gather data from all the workers. dh::safe_nccl(ncclAllGather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type), nccl_comm_, cuda_stream_)); + if (needs_sync_) { + dh::safe_cuda(cudaStreamSynchronize(cuda_stream_)); + } // Then reduce locally. auto *out_buffer = static_cast(send_receive_buffer); diff --git a/src/collective/nccl_device_communicator.cuh b/src/collective/nccl_device_communicator.cuh index 925603d21..bb3fce45c 100644 --- a/src/collective/nccl_device_communicator.cuh +++ b/src/collective/nccl_device_communicator.cuh @@ -12,7 +12,20 @@ namespace collective { class NcclDeviceCommunicator : public DeviceCommunicator { public: - explicit NcclDeviceCommunicator(int device_ordinal); + /** + * @brief Construct a new NCCL communicator. + * @param device_ordinal The GPU device id. + * @param needs_sync Whether extra CUDA stream synchronization is needed. 
+ * + * In multi-GPU tests when multiple NCCL communicators are created in the same process, sometimes + * a deadlock happens because NCCL kernels are blocking. The extra CUDA stream synchronization + * makes sure that the NCCL kernels are caught up, thus avoiding the deadlock. + * + * The Rabit communicator runs with one process per GPU, so the additional synchronization is not + * needed. The in-memory communicator is used in tests with multiple threads, each thread + * representing a rank/worker, so the additional synchronization is needed to avoid deadlocks. + */ + explicit NcclDeviceCommunicator(int device_ordinal, bool needs_sync); ~NcclDeviceCommunicator() override; void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type, Operation op) override; @@ -60,6 +73,7 @@ class NcclDeviceCommunicator : public DeviceCommunicator { Operation op); int const device_ordinal_; + bool const needs_sync_; int const world_size_; int const rank_; ncclComm_t nccl_comm_{}; diff --git a/tests/cpp/collective/test_nccl_device_communicator.cu b/tests/cpp/collective/test_nccl_device_communicator.cu index 81dd3d46d..cd9cd26de 100644 --- a/tests/cpp/collective/test_nccl_device_communicator.cu +++ b/tests/cpp/collective/test_nccl_device_communicator.cu @@ -16,7 +16,7 @@ namespace xgboost { namespace collective { TEST(NcclDeviceCommunicatorSimpleTest, ThrowOnInvalidDeviceOrdinal) { - auto construct = []() { NcclDeviceCommunicator comm{-1}; }; + auto construct = []() { NcclDeviceCommunicator comm{-1, false}; }; EXPECT_THROW(construct(), dmlc::Error); } From 59787b23af3993346018557931592f23839df621 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 8 Jul 2023 09:24:35 +0800 Subject: [PATCH 024/136] Allow empty page in external memory. (#9361) --- src/data/gradient_index_format.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/data/gradient_index_format.cc b/src/data/gradient_index_format.cc index 241abfb1f..fa8f492ed 100644 --- a/src/data/gradient_index_format.cc +++ b/src/data/gradient_index_format.cc @@ -68,7 +68,6 @@ class GHistIndexRawFormat : public SparsePageFormat { } std::size_t Write(GHistIndexMatrix const& page, common::AlignedFileWriteStream* fo) override { - CHECK_NE(page.index.Size(), 0) << "Empty page is not supported."; std::size_t bytes = 0; bytes += WriteHistogramCuts(page.cut, fo); // indptr @@ -81,7 +80,9 @@ class GHistIndexRawFormat : public SparsePageFormat { // - index buffer std::vector data(page.index.begin(), page.index.end()); bytes += fo->Write(static_cast(data.size())); - bytes += fo->Write(data.data(), data.size()); + if (!data.empty()) { + bytes += fo->Write(data.data(), data.size()); + } // hit count bytes += common::WriteVec(fo, page.hit_count); From c3124813e8aa1fd5724b1e55df7fee44b02fc09d Mon Sep 17 00:00:00 2001 From: edumugi <42951344+edumugi@users.noreply.github.com> Date: Sat, 8 Jul 2023 07:18:12 +0200 Subject: [PATCH 025/136] Support numpy vertical split (#9365) --- python-package/xgboost/data.py | 11 +++++++++-- src/c_api/c_api.cc | 5 ++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 5e1a309e0..7864d541f 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -197,6 +197,7 @@ def _from_numpy_array( nthread: int, feature_names: Optional[FeatureNames], feature_types: Optional[FeatureTypes], + data_split_mode: DataSplitMode = DataSplitMode.ROW, ) -> DispatchedDataBackendReturnType: """Initialize data from a 
2-D numpy matrix.""" _check_data_shape(data) @@ -205,7 +206,11 @@ def _from_numpy_array( _check_call( _LIB.XGDMatrixCreateFromDense( _array_interface(data), - make_jcargs(missing=float(missing), nthread=int(nthread)), + make_jcargs( + missing=float(missing), + nthread=int(nthread), + data_split_mode=int(data_split_mode), + ), ctypes.byref(handle), ) ) @@ -1046,7 +1051,9 @@ def dispatch_data_backend( data.tocsr(), missing, threads, feature_names, feature_types ) if _is_numpy_array(data): - return _from_numpy_array(data, missing, threads, feature_names, feature_types) + return _from_numpy_array( + data, missing, threads, feature_names, feature_types, data_split_mode + ) if _is_uri(data): return _from_uri(data, missing, feature_names, feature_types, data_split_mode) if _is_list(data): diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index e0f2d47b0..06bd43b2b 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -463,8 +463,11 @@ XGB_DLL int XGDMatrixCreateFromDense(char const *data, auto config = Json::Load(StringView{c_json_config}); float missing = GetMissing(config); auto n_threads = OptionalArg(config, "nthread", 0); + auto data_split_mode = + static_cast(OptionalArg(config, "data_split_mode", 0)); xgboost_CHECK_C_ARG_PTR(out); - *out = new std::shared_ptr(DMatrix::Create(&adapter, missing, n_threads)); + *out = new std::shared_ptr( + DMatrix::Create(&adapter, missing, n_threads, "", data_split_mode)); API_END(); } From 20c52f07d2afeb8775586dbba11766571c5e585b Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 8 Jul 2023 15:32:41 +0800 Subject: [PATCH 026/136] Support exporting cut values (#9356) --- include/xgboost/c_api.h | 24 ++- include/xgboost/data.h | 86 ++------ python-package/xgboost/core.py | 75 +++++++ python-package/xgboost/testing/__init__.py | 8 + python-package/xgboost/testing/updater.py | 99 ++++++++- src/c_api/c_api.cc | 102 +++++++++- src/common/api_entry.h | 2 + src/data/array_interface.h | 2 +- src/data/ellpack_page.cc | 21 +- src/data/ellpack_page.cu | 15 ++ src/data/ellpack_page.cuh | 19 +- src/data/ellpack_page.h | 59 ++++++ src/data/ellpack_page_source.cu | 7 +- src/data/ellpack_page_source.h | 9 +- src/data/gradient_index.h | 3 + src/data/simple_dmatrix.cc | 3 +- src/data/sparse_page_dmatrix.cc | 5 +- src/data/sparse_page_dmatrix.cu | 10 +- src/tree/updater_gpu_hist.cu | 1 + tests/cpp/c_api/test_c_api.cc | 222 ++++++++++++++++++++- tests/cpp/data/test_ellpack_page.cu | 1 + tests/cpp/data/test_iterative_dmatrix.cu | 1 + tests/cpp/data/test_sparse_page_dmatrix.cu | 1 + tests/cpp/helpers.cc | 14 +- tests/cpp/tree/test_gpu_hist.cu | 2 + tests/python-gpu/test_from_cupy.py | 11 + tests/python-gpu/test_gpu_updaters.py | 10 +- tests/python/test_updaters.py | 11 +- 28 files changed, 722 insertions(+), 101 deletions(-) create mode 100644 src/data/ellpack_page.h diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index 3cfba0468..2a7d51393 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -810,7 +810,7 @@ XGB_DLL int XGDMatrixNumCol(DMatrixHandle handle, */ XGB_DLL int XGDMatrixNumNonMissing(DMatrixHandle handle, bst_ulong *out); -/*! +/** * \brief Get the predictors from DMatrix as CSR matrix for testing. If this is a * quantized DMatrix, quantized values are returned instead. * @@ -819,8 +819,10 @@ XGB_DLL int XGDMatrixNumNonMissing(DMatrixHandle handle, bst_ulong *out); * XGBoost. This is to avoid allocating a huge memory buffer that can not be freed until * exiting the thread. 
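On the Python side this C function is surfaced through `DMatrix.get_data`, which returns a SciPy CSR matrix; a minimal sketch:

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(8, 4)
    Xy = xgb.DMatrix(X)
    csr = Xy.get_data()  # scipy.sparse.csr_matrix holding the stored predictors
    # Values are stored as float32 internally, hence the tolerance.
    np.testing.assert_allclose(csr.toarray(), X, rtol=1e-6)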
* + * @since 1.7.0 + * * \param handle the handle to the DMatrix - * \param config Json configuration string. At the moment it should be an empty document, + * \param config JSON configuration string. At the moment it should be an empty document, * preserved for future use. * \param out_indptr indptr of output CSR matrix. * \param out_indices Column index of output CSR matrix. @@ -831,6 +833,24 @@ XGB_DLL int XGDMatrixNumNonMissing(DMatrixHandle handle, bst_ulong *out); XGB_DLL int XGDMatrixGetDataAsCSR(DMatrixHandle const handle, char const *config, bst_ulong *out_indptr, unsigned *out_indices, float *out_data); +/** + * @brief Export the quantile cuts used for training histogram-based models like `hist` and + * `approx`. Useful for model compression. + * + * @since 2.0.0 + * + * @param handle the handle to the DMatrix + * @param config JSON configuration string. At the moment it should be an empty document, + * preserved for future use. + * + * @param out_indptr indptr of output CSC matrix represented by a JSON encoded + * __(cuda_)array_interface__. + * @param out_data Data value of CSC matrix represented by a JSON encoded + * __(cuda_)array_interface__. + */ +XGB_DLL int XGDMatrixGetQuantileCut(DMatrixHandle const handle, char const *config, + char const **out_indptr, char const **out_data); + /** @} */ // End of DMatrix /** diff --git a/include/xgboost/data.h b/include/xgboost/data.h index 6305abff8..472ca43b3 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -282,7 +282,7 @@ struct BatchParam { BatchParam(bst_bin_t max_bin, common::Span hessian, bool regenerate) : max_bin{max_bin}, hess{hessian}, regen{regenerate} {} - bool ParamNotEqual(BatchParam const& other) const { + [[nodiscard]] bool ParamNotEqual(BatchParam const& other) const { // Check non-floating parameters. bool cond = max_bin != other.max_bin; // Check sparse thresh. @@ -293,11 +293,11 @@ struct BatchParam { return cond; } - bool Initialized() const { return max_bin != 0; } + [[nodiscard]] bool Initialized() const { return max_bin != 0; } /** * \brief Make a copy of self for DMatrix to describe how its existing index was generated. */ - BatchParam MakeCache() const { + [[nodiscard]] BatchParam MakeCache() const { auto p = *this; // These parameters have nothing to do with how the gradient index was generated in the // first place. @@ -319,7 +319,7 @@ struct HostSparsePageView { static_cast(size)}; } - size_t Size() const { return offset.size() == 0 ? 0 : offset.size() - 1; } + [[nodiscard]] size_t Size() const { return offset.size() == 0 ? 0 : offset.size() - 1; } }; /*! @@ -337,7 +337,7 @@ class SparsePage { /*! \brief an instance of sparse vector in the batch */ using Inst = common::Span; - HostSparsePageView GetView() const { + [[nodiscard]] HostSparsePageView GetView() const { return {offset.ConstHostSpan(), data.ConstHostSpan()}; } @@ -353,12 +353,12 @@ class SparsePage { virtual ~SparsePage() = default; /*! \return Number of instances in the page. */ - inline size_t Size() const { + [[nodiscard]] size_t Size() const { return offset.Size() == 0 ? 0 : offset.Size() - 1; } /*! 
\return estimation of memory cost of this page */ - inline size_t MemCostBytes() const { + [[nodiscard]] size_t MemCostBytes() const { return offset.Size() * sizeof(size_t) + data.Size() * sizeof(Entry); } @@ -376,7 +376,7 @@ class SparsePage { base_rowid = row_id; } - SparsePage GetTranspose(int num_columns, int32_t n_threads) const; + [[nodiscard]] SparsePage GetTranspose(int num_columns, int32_t n_threads) const; /** * \brief Sort the column index. @@ -385,7 +385,7 @@ class SparsePage { /** * \brief Check wether the column index is sorted. */ - bool IsIndicesSorted(int32_t n_threads) const; + [[nodiscard]] bool IsIndicesSorted(int32_t n_threads) const; /** * \brief Reindex the column index with an offset. */ @@ -440,49 +440,7 @@ class SortedCSCPage : public SparsePage { explicit SortedCSCPage(SparsePage page) : SparsePage(std::move(page)) {} }; -class EllpackPageImpl; -/*! - * \brief A page stored in ELLPACK format. - * - * This class uses the PImpl idiom (https://en.cppreference.com/w/cpp/language/pimpl) to avoid - * including CUDA-specific implementation details in the header. - */ -class EllpackPage { - public: - /*! - * \brief Default constructor. - * - * This is used in the external memory case. An empty ELLPACK page is constructed with its content - * set later by the reader. - */ - EllpackPage(); - - /*! - * \brief Constructor from an existing DMatrix. - * - * This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix - * in CSR format. - */ - explicit EllpackPage(Context const* ctx, DMatrix* dmat, const BatchParam& param); - - /*! \brief Destructor. */ - ~EllpackPage(); - - EllpackPage(EllpackPage&& that); - - /*! \return Number of instances in the page. */ - size_t Size() const; - - /*! \brief Set the base row id for this page. */ - void SetBaseRowId(std::size_t row_id); - - const EllpackPageImpl* Impl() const { return impl_.get(); } - EllpackPageImpl* Impl() { return impl_.get(); } - - private: - std::unique_ptr impl_; -}; - +class EllpackPage; class GHistIndexMatrix; template @@ -492,7 +450,7 @@ class BatchIteratorImpl { virtual ~BatchIteratorImpl() = default; virtual const T& operator*() const = 0; virtual BatchIteratorImpl& operator++() = 0; - virtual bool AtEnd() const = 0; + [[nodiscard]] virtual bool AtEnd() const = 0; virtual std::shared_ptr Page() const = 0; }; @@ -519,12 +477,12 @@ class BatchIterator { return !impl_->AtEnd(); } - bool AtEnd() const { + [[nodiscard]] bool AtEnd() const { CHECK(impl_ != nullptr); return impl_->AtEnd(); } - std::shared_ptr Page() const { + [[nodiscard]] std::shared_ptr Page() const { return impl_->Page(); } @@ -563,15 +521,15 @@ class DMatrix { this->Info().SetInfo(ctx, key, StringView{interface_str}); } /*! \brief meta information of the dataset */ - virtual const MetaInfo& Info() const = 0; + [[nodiscard]] virtual const MetaInfo& Info() const = 0; /*! \brief Get thread local memory for returning data from DMatrix. */ - XGBAPIThreadLocalEntry& GetThreadLocal() const; + [[nodiscard]] XGBAPIThreadLocalEntry& GetThreadLocal() const; /** * \brief Get the context object of this DMatrix. The context is created during construction of * DMatrix with user specified `nthread` parameter. */ - virtual Context const* Ctx() const = 0; + [[nodiscard]] virtual Context const* Ctx() const = 0; /** * \brief Gets batches. Use range based for loop over BatchSet to access individual batches. 
@@ -583,16 +541,16 @@ class DMatrix { template BatchSet GetBatches(Context const* ctx, const BatchParam& param); template - bool PageExists() const; + [[nodiscard]] bool PageExists() const; // the following are column meta data, should be able to answer them fast. /*! \return Whether the data columns single column block. */ - virtual bool SingleColBlock() const = 0; + [[nodiscard]] virtual bool SingleColBlock() const = 0; /*! \brief virtual destructor */ virtual ~DMatrix(); /*! \brief Whether the matrix is dense. */ - bool IsDense() const { + [[nodiscard]] bool IsDense() const { return Info().num_nonzero_ == Info().num_row_ * Info().num_col_; } @@ -695,9 +653,9 @@ class DMatrix { BatchParam const& param) = 0; virtual BatchSet GetExtBatches(Context const* ctx, BatchParam const& param) = 0; - virtual bool EllpackExists() const = 0; - virtual bool GHistIndexExists() const = 0; - virtual bool SparsePageExists() const = 0; + [[nodiscard]] virtual bool EllpackExists() const = 0; + [[nodiscard]] virtual bool GHistIndexExists() const = 0; + [[nodiscard]] virtual bool SparsePageExists() const = 0; }; template <> diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 07e8d89cc..31f34256d 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -3,6 +3,7 @@ """Core XGBoost Library.""" import copy import ctypes +import importlib.util import json import os import re @@ -381,6 +382,54 @@ def c_array( return (ctype * len(values))(*values) +def from_array_interface(interface: dict) -> NumpyOrCupy: + """Convert array interface to numpy or cupy array""" + + class Array: # pylint: disable=too-few-public-methods + """Wrapper type for communicating with numpy and cupy.""" + + _interface: Optional[dict] = None + + @property + def __array_interface__(self) -> Optional[dict]: + return self._interface + + @__array_interface__.setter + def __array_interface__(self, interface: dict) -> None: + self._interface = copy.copy(interface) + # converts some fields to tuple as required by numpy + self._interface["shape"] = tuple(self._interface["shape"]) + self._interface["data"] = tuple(self._interface["data"]) + if self._interface.get("strides", None) is not None: + self._interface["strides"] = tuple(self._interface["strides"]) + + @property + def __cuda_array_interface__(self) -> Optional[dict]: + return self.__array_interface__ + + @__cuda_array_interface__.setter + def __cuda_array_interface__(self, interface: dict) -> None: + self.__array_interface__ = interface + + arr = Array() + + if "stream" in interface: + # CUDA stream is presented, this is a __cuda_array_interface__. 
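For reference, the numpy branch of this helper round-trips like the following sketch; the import path is internal and may change:

    import json
    import numpy as np
    from xgboost.core import from_array_interface

    arr = np.arange(8, dtype=np.float32).reshape(2, 4)
    # JSON serialization turns the interface tuples into lists; the helper
    # converts them back before handing the dict to numpy.
    interface = json.loads(json.dumps(arr.__array_interface__))
    restored = from_array_interface(interface)
    np.testing.assert_equal(restored, arr)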
+ spec = importlib.util.find_spec("cupy") + if spec is None: + raise ImportError("`cupy` is required for handling CUDA buffer.") + + import cupy as cp # pylint: disable=import-error + + arr.__cuda_array_interface__ = interface + out = cp.array(arr, copy=True) + else: + arr.__array_interface__ = interface + out = np.array(arr, copy=True) + + return out + + def _prediction_output( shape: CNumericPtr, dims: c_bst_ulong, predts: CFloatPtr, is_cuda: bool ) -> NumpyOrCupy: @@ -1060,6 +1109,32 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m ) return ret + def get_quantile_cut(self) -> Tuple[np.ndarray, np.ndarray]: + """Get quantile cuts for quantization.""" + n_features = self.num_col() + + c_sindptr = ctypes.c_char_p() + c_sdata = ctypes.c_char_p() + config = make_jcargs() + _check_call( + _LIB.XGDMatrixGetQuantileCut( + self.handle, config, ctypes.byref(c_sindptr), ctypes.byref(c_sdata) + ) + ) + assert c_sindptr.value is not None + assert c_sdata.value is not None + + i_indptr = json.loads(c_sindptr.value) + indptr = from_array_interface(i_indptr) + assert indptr.size == n_features + 1 + assert indptr.dtype == np.uint64 + + i_data = json.loads(c_sdata.value) + data = from_array_interface(i_data) + assert data.size == indptr[-1] + assert data.dtype == np.float32 + return indptr, data + def num_row(self) -> int: """Get the number of rows in the DMatrix.""" ret = c_bst_ulong() diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 862375026..8e2e13f43 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -265,6 +265,14 @@ def make_batches( return X, y, w +def make_regression( + n_samples: int, n_features: int, use_cupy: bool +) -> Tuple[ArrayLike, ArrayLike, ArrayLike]: + """Make a simple regression dataset.""" + X, y, w = make_batches(n_samples, n_features, 1, use_cupy) + return X[0], y[0], w[0] + + def make_batches_sparse( n_samples_per_batch: int, n_features: int, n_batches: int, sparsity: float ) -> Tuple[List[sparse.csr_matrix], List[np.ndarray], List[np.ndarray]]: diff --git a/python-package/xgboost/testing/updater.py b/python-package/xgboost/testing/updater.py index 4086f92c8..62df8ec2e 100644 --- a/python-package/xgboost/testing/updater.py +++ b/python-package/xgboost/testing/updater.py @@ -1,7 +1,7 @@ """Tests for updaters.""" import json from functools import partial, update_wrapper -from typing import Dict +from typing import Any, Dict import numpy as np @@ -159,3 +159,100 @@ def check_quantile_loss(tree_method: str, weighted: bool) -> None: for i in range(alpha.shape[0]): np.testing.assert_allclose(predts[:, i], predt_multi[:, i]) + + +def check_cut( + n_entries: int, indptr: np.ndarray, data: np.ndarray, dtypes: Any +) -> None: + """Check the cut values.""" + from pandas.api.types import is_categorical_dtype + + assert data.shape[0] == indptr[-1] + assert data.shape[0] == n_entries + + assert indptr.dtype == np.uint64 + for i in range(1, indptr.size): + beg = int(indptr[i - 1]) + end = int(indptr[i]) + for j in range(beg + 1, end): + assert data[j] > data[j - 1] + if is_categorical_dtype(dtypes[i - 1]): + assert data[j] == data[j - 1] + 1 + + +def check_get_quantile_cut_device(tree_method: str, use_cupy: bool) -> None: + """Check with optional cupy.""" + from pandas.api.types import is_categorical_dtype + + n_samples = 1024 + n_features = 14 + max_bin = 16 + dtypes = [np.float32] * n_features + + # numerical + X, y, w = 
tm.make_regression(n_samples, n_features, use_cupy=use_cupy) + # - qdm + Xyw: xgb.DMatrix = xgb.QuantileDMatrix(X, y, weight=w, max_bin=max_bin) + indptr, data = Xyw.get_quantile_cut() + check_cut((max_bin + 1) * n_features, indptr, data, dtypes) + # - dm + Xyw = xgb.DMatrix(X, y, weight=w) + xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xyw) + indptr, data = Xyw.get_quantile_cut() + check_cut((max_bin + 1) * n_features, indptr, data, dtypes) + # - ext mem + n_batches = 3 + n_samples_per_batch = 256 + it = tm.IteratorForTest( + *tm.make_batches(n_samples_per_batch, n_features, n_batches, use_cupy), + cache="cache", + ) + Xy: xgb.DMatrix = xgb.DMatrix(it) + xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xyw) + indptr, data = Xyw.get_quantile_cut() + check_cut((max_bin + 1) * n_features, indptr, data, dtypes) + + # categorical + n_categories = 32 + X, y = tm.make_categorical(n_samples, n_features, n_categories, False, sparsity=0.8) + if use_cupy: + import cudf # pylint: disable=import-error + import cupy as cp # pylint: disable=import-error + + X = cudf.from_pandas(X) + y = cp.array(y) + # - qdm + Xy = xgb.QuantileDMatrix(X, y, max_bin=max_bin, enable_categorical=True) + indptr, data = Xy.get_quantile_cut() + check_cut(n_categories * n_features, indptr, data, X.dtypes) + # - dm + Xy = xgb.DMatrix(X, y, enable_categorical=True) + xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xy) + indptr, data = Xy.get_quantile_cut() + check_cut(n_categories * n_features, indptr, data, X.dtypes) + + # mixed + X, y = tm.make_categorical( + n_samples, n_features, n_categories, False, sparsity=0.8, cat_ratio=0.5 + ) + n_cat_features = len([0 for dtype in X.dtypes if is_categorical_dtype(dtype)]) + n_num_features = n_features - n_cat_features + n_entries = n_categories * n_cat_features + (max_bin + 1) * n_num_features + # - qdm + Xy = xgb.QuantileDMatrix(X, y, max_bin=max_bin, enable_categorical=True) + indptr, data = Xy.get_quantile_cut() + check_cut(n_entries, indptr, data, X.dtypes) + # - dm + Xy = xgb.DMatrix(X, y, enable_categorical=True) + xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xy) + indptr, data = Xy.get_quantile_cut() + check_cut(n_entries, indptr, data, X.dtypes) + + +def check_get_quantile_cut(tree_method: str) -> None: + """Check the quantile cut getter.""" + + use_cupy = tree_method == "gpu_hist" + check_get_quantile_cut_device(tree_method, False) + if use_cupy: + check_get_quantile_cut_device(tree_method, True) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 06bd43b2b..4e1f86ff2 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -3,7 +3,7 @@ */ #include "xgboost/c_api.h" -#include // for copy +#include // for copy, transform #include // for strtoimax #include // for nan #include // for strcmp @@ -20,9 +20,11 @@ #include "../collective/communicator-inl.h" // for Allreduce, Broadcast, Finalize, GetProcessor... #include "../common/api_entry.h" // for XGBAPIThreadLocalEntry #include "../common/charconv.h" // for from_chars, to_chars, NumericLimits, from_ch... +#include "../common/hist_util.h" // for HistogramCuts #include "../common/io.h" // for FileExtension, LoadSequentialFile, MemoryBuf... #include "../common/threading_utils.h" // for OmpGetNumThreads, ParallelFor #include "../data/adapter.h" // for ArrayAdapter, DenseAdapter, RecordBatchesIte... 
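A minimal sketch of the getter these helpers exercise, using the `DMatrix.get_quantile_cut` method added above:

    import numpy as np
    import xgboost as xgb

    rng = np.random.default_rng(2023)
    X, y = rng.random((256, 4)), rng.random(256)

    # QuantileDMatrix computes the cuts eagerly, so no training is needed first.
    Xy = xgb.QuantileDMatrix(X, y, max_bin=16)
    indptr, data = Xy.get_quantile_cut()

    assert indptr.size == Xy.num_col() + 1  # one CSC column per feature
    assert data.size == indptr[-1]          # cut values, including per-feature minimums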
+#include "../data/ellpack_page.h" // for EllpackPage #include "../data/proxy_dmatrix.h" // for DMatrixProxy #include "../data/simple_dmatrix.h" // for SimpleDMatrix #include "c_api_error.h" // for xgboost_CHECK_C_ARG_PTR, API_END, API_BEGIN @@ -785,6 +787,104 @@ XGB_DLL int XGDMatrixGetDataAsCSR(DMatrixHandle const handle, char const *config API_END(); } +namespace { +template +void GetCutImpl(Context const *ctx, std::shared_ptr p_m, + std::vector *p_indptr, std::vector *p_data) { + auto &indptr = *p_indptr; + auto &data = *p_data; + for (auto const &page : p_m->GetBatches(ctx, {})) { + auto const &cut = page.Cuts(); + + auto const &ptrs = cut.Ptrs(); + indptr.resize(ptrs.size()); + + auto const &vals = cut.Values(); + auto const &mins = cut.MinValues(); + + bst_feature_t n_features = p_m->Info().num_col_; + auto ft = p_m->Info().feature_types.ConstHostSpan(); + std::size_t n_categories = std::count_if(ft.cbegin(), ft.cend(), + [](auto t) { return t == FeatureType::kCategorical; }); + data.resize(vals.size() + n_features - n_categories); // |vals| + |mins| + std::size_t i{0}, n_numeric{0}; + for (bst_feature_t fidx = 0; fidx < n_features; ++fidx) { + CHECK_LT(i, data.size()); + bool is_numeric = !common::IsCat(ft, fidx); + if (is_numeric) { + data[i] = mins[fidx]; + i++; + } + auto beg = ptrs[fidx]; + auto end = ptrs[fidx + 1]; + CHECK_LE(end, data.size()); + std::copy(vals.cbegin() + beg, vals.cbegin() + end, data.begin() + i); + i += (end - beg); + // shift by min values. + indptr[fidx] = ptrs[fidx] + n_numeric; + if (is_numeric) { + n_numeric++; + } + } + CHECK_EQ(n_numeric, n_features - n_categories); + + indptr.back() = data.size(); + CHECK_EQ(indptr.back(), vals.size() + mins.size() - n_categories); + break; + } +} +} // namespace + +XGB_DLL int XGDMatrixGetQuantileCut(DMatrixHandle const handle, char const *config, + char const **out_indptr, char const **out_data) { + API_BEGIN(); + CHECK_HANDLE(); + + auto p_m = CastDMatrixHandle(handle); + + xgboost_CHECK_C_ARG_PTR(config); + xgboost_CHECK_C_ARG_PTR(out_indptr); + xgboost_CHECK_C_ARG_PTR(out_data); + + auto jconfig = Json::Load(StringView{config}); + + if (!p_m->PageExists() && !p_m->PageExists()) { + LOG(FATAL) << "The quantile cut hasn't been generated yet. Unless this is a `QuantileDMatrix`, " + "quantile cut is generated during training."; + } + // Get return buffer + auto &data = p_m->GetThreadLocal().ret_vec_float; + auto &indptr = p_m->GetThreadLocal().ret_vec_u64; + + if (p_m->PageExists()) { + auto ctx = p_m->Ctx()->IsCPU() ? *p_m->Ctx() : p_m->Ctx()->MakeCPU(); + GetCutImpl(&ctx, p_m, &indptr, &data); + } else { + auto ctx = p_m->Ctx()->IsCUDA() ? 
*p_m->Ctx() : p_m->Ctx()->MakeCUDA(0); + GetCutImpl(&ctx, p_m, &indptr, &data); + } + + // Create a CPU context + Context ctx; + // Get return buffer + auto &ret_vec_str = p_m->GetThreadLocal().ret_vec_str; + ret_vec_str.clear(); + + ret_vec_str.emplace_back(linalg::ArrayInterfaceStr( + linalg::MakeTensorView(&ctx, common::Span{indptr.data(), indptr.size()}, indptr.size()))); + ret_vec_str.emplace_back(linalg::ArrayInterfaceStr( + linalg::MakeTensorView(&ctx, common::Span{data.data(), data.size()}, data.size()))); + + auto &charp_vecs = p_m->GetThreadLocal().ret_vec_charp; + charp_vecs.resize(ret_vec_str.size()); + std::transform(ret_vec_str.cbegin(), ret_vec_str.cend(), charp_vecs.begin(), + [](auto const &str) { return str.c_str(); }); + + *out_indptr = charp_vecs[0]; + *out_data = charp_vecs[1]; + API_END(); +} + // xgboost implementation XGB_DLL int XGBoosterCreate(const DMatrixHandle dmats[], xgboost::bst_ulong len, diff --git a/src/common/api_entry.h b/src/common/api_entry.h index db3bcfbc3..df1fcd704 100644 --- a/src/common/api_entry.h +++ b/src/common/api_entry.h @@ -24,6 +24,8 @@ struct XGBAPIThreadLocalEntry { std::vector ret_vec_charp; /*! \brief returning float vector. */ std::vector ret_vec_float; + /*! \brief returning uint vector. */ + std::vector ret_vec_u64; /*! \brief temp variable of gradient pairs. */ std::vector tmp_gpair; /*! \brief Temp variable for returning prediction result. */ diff --git a/src/data/array_interface.h b/src/data/array_interface.h index bd66c2a53..99effffef 100644 --- a/src/data/array_interface.h +++ b/src/data/array_interface.h @@ -455,7 +455,7 @@ class ArrayInterface { explicit ArrayInterface(std::string const &str) : ArrayInterface{StringView{str}} {} - explicit ArrayInterface(StringView str) : ArrayInterface{Json::Load(str)} {} + explicit ArrayInterface(StringView str) : ArrayInterface{Json::Load(str)} {} void AssignType(StringView typestr) { using T = ArrayInterfaceHandler::Type; diff --git a/src/data/ellpack_page.cc b/src/data/ellpack_page.cc index 1fd8f12b2..59cfd1943 100644 --- a/src/data/ellpack_page.cc +++ b/src/data/ellpack_page.cc @@ -3,12 +3,20 @@ */ #ifndef XGBOOST_USE_CUDA +#include "ellpack_page.h" + #include // dummy implementation of EllpackPage in case CUDA is not used namespace xgboost { -class EllpackPageImpl {}; +class EllpackPageImpl { + common::HistogramCuts cuts_; + + public: + [[nodiscard]] common::HistogramCuts& Cuts() { return cuts_; } + [[nodiscard]] common::HistogramCuts const& Cuts() const { return cuts_; } +}; EllpackPage::EllpackPage() = default; @@ -32,6 +40,17 @@ size_t EllpackPage::Size() const { return 0; } +[[nodiscard]] common::HistogramCuts& EllpackPage::Cuts() { + LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but " + "EllpackPage is required"; + return impl_->Cuts(); +} + +[[nodiscard]] common::HistogramCuts const& EllpackPage::Cuts() const { + LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but " + "EllpackPage is required"; + return impl_->Cuts(); +} } // namespace xgboost #endif // XGBOOST_USE_CUDA diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index 13fcf9adf..0ccd7a081 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -4,6 +4,10 @@ #include #include +#include // for copy +#include // for move +#include // for vector + #include "../common/categorical.h" #include "../common/cuda_context.cuh" #include "../common/hist_util.cuh" @@ -11,6 +15,7 @@ #include "../common/transform_iterator.h" // MakeIndexTransformIter #include 
"./ellpack_page.cuh" #include "device_adapter.cuh" // for HasInfInData +#include "ellpack_page.h" #include "gradient_index.h" #include "xgboost/data.h" @@ -29,6 +34,16 @@ size_t EllpackPage::Size() const { return impl_->Size(); } void EllpackPage::SetBaseRowId(std::size_t row_id) { impl_->SetBaseRowId(row_id); } +[[nodiscard]] common::HistogramCuts& EllpackPage::Cuts() { + CHECK(impl_); + return impl_->Cuts(); +} + +[[nodiscard]] common::HistogramCuts const& EllpackPage::Cuts() const { + CHECK(impl_); + return impl_->Cuts(); +} + // Bin each input data entry, store the bin indices in compressed form. __global__ void CompressBinEllpackKernel( common::CompressedBufferWriter wr, diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh index ee6a2c221..96963463b 100644 --- a/src/data/ellpack_page.cuh +++ b/src/data/ellpack_page.cuh @@ -1,17 +1,18 @@ -/*! - * Copyright 2019 by XGBoost Contributors +/** + * Copyright 2019-2023, XGBoost Contributors */ -#ifndef XGBOOST_DATA_ELLPACK_PAGE_H_ -#define XGBOOST_DATA_ELLPACK_PAGE_H_ +#ifndef XGBOOST_DATA_ELLPACK_PAGE_CUH_ +#define XGBOOST_DATA_ELLPACK_PAGE_CUH_ +#include #include +#include "../common/categorical.h" #include "../common/compressed_iterator.h" #include "../common/device_helpers.cuh" #include "../common/hist_util.h" -#include "../common/categorical.h" -#include +#include "ellpack_page.h" namespace xgboost { /** \brief Struct for accessing and manipulating an ELLPACK matrix on the @@ -194,8 +195,8 @@ class EllpackPageImpl { base_rowid = row_id; } - common::HistogramCuts& Cuts() { return cuts_; } - common::HistogramCuts const& Cuts() const { return cuts_; } + [[nodiscard]] common::HistogramCuts& Cuts() { return cuts_; } + [[nodiscard]] common::HistogramCuts const& Cuts() const { return cuts_; } /*! \return Estimation of memory cost of this page. */ static size_t MemCostBytes(size_t num_rows, size_t row_stride, const common::HistogramCuts&cuts) ; @@ -256,4 +257,4 @@ inline size_t GetRowStride(DMatrix* dmat) { } } // namespace xgboost -#endif // XGBOOST_DATA_ELLPACK_PAGE_H_ +#endif // XGBOOST_DATA_ELLPACK_PAGE_CUH_ diff --git a/src/data/ellpack_page.h b/src/data/ellpack_page.h new file mode 100644 index 000000000..07d6949b1 --- /dev/null +++ b/src/data/ellpack_page.h @@ -0,0 +1,59 @@ +/** + * Copyright 2017-2023 by XGBoost Contributors + */ +#ifndef XGBOOST_DATA_ELLPACK_PAGE_H_ +#define XGBOOST_DATA_ELLPACK_PAGE_H_ + +#include // for unique_ptr + +#include "../common/hist_util.h" // for HistogramCuts +#include "xgboost/context.h" // for Context +#include "xgboost/data.h" // for DMatrix, BatchParam + +namespace xgboost { +class EllpackPageImpl; +/** + * @brief A page stored in ELLPACK format. + * + * This class uses the PImpl idiom (https://en.cppreference.com/w/cpp/language/pimpl) to avoid + * including CUDA-specific implementation details in the header. + */ +class EllpackPage { + public: + /** + * @brief Default constructor. + * + * This is used in the external memory case. An empty ELLPACK page is constructed with its content + * set later by the reader. + */ + EllpackPage(); + /** + * @brief Constructor from an existing DMatrix. + * + * This is used in the in-memory case. The ELLPACK page is constructed from an existing DMatrix + * in CSR format. + */ + explicit EllpackPage(Context const* ctx, DMatrix* dmat, const BatchParam& param); + + /*! \brief Destructor. */ + ~EllpackPage(); + + EllpackPage(EllpackPage&& that); + + /*! \return Number of instances in the page. */ + [[nodiscard]] size_t Size() const; + + /*! 
\brief Set the base row id for this page. */ + void SetBaseRowId(std::size_t row_id); + + [[nodiscard]] const EllpackPageImpl* Impl() const { return impl_.get(); } + EllpackPageImpl* Impl() { return impl_.get(); } + + [[nodiscard]] common::HistogramCuts& Cuts(); + [[nodiscard]] common::HistogramCuts const& Cuts() const; + + private: + std::unique_ptr impl_; +}; +} // namespace xgboost +#endif // XGBOOST_DATA_ELLPACK_PAGE_H_ diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index fb414f4ae..abfc400c1 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -5,10 +5,10 @@ #include #include "ellpack_page.cuh" +#include "ellpack_page.h" // for EllpackPage #include "ellpack_page_source.h" -namespace xgboost { -namespace data { +namespace xgboost::data { void EllpackPageSource::Fetch() { dh::safe_cuda(cudaSetDevice(device_)); if (!this->ReadCache()) { @@ -27,5 +27,4 @@ void EllpackPageSource::Fetch() { this->WriteCache(); } } -} // namespace data -} // namespace xgboost +} // namespace xgboost::data diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h index 121ffcf9e..146db94ed 100644 --- a/src/data/ellpack_page_source.h +++ b/src/data/ellpack_page_source.h @@ -6,17 +6,17 @@ #define XGBOOST_DATA_ELLPACK_PAGE_SOURCE_H_ #include + #include #include #include #include "../common/common.h" #include "../common/hist_util.h" +#include "ellpack_page.h" // for EllpackPage #include "sparse_page_source.h" -namespace xgboost { -namespace data { - +namespace xgboost::data { class EllpackPageSource : public PageSourceIncMixIn { bool is_dense_; size_t row_stride_; @@ -53,7 +53,6 @@ inline void EllpackPageSource::Fetch() { common::AssertGPUSupport(); } #endif // !defined(XGBOOST_USE_CUDA) -} // namespace data -} // namespace xgboost +} // namespace xgboost::data #endif // XGBOOST_DATA_ELLPACK_PAGE_SOURCE_H_ diff --git a/src/data/gradient_index.h b/src/data/gradient_index.h index 840be4b06..901451ad9 100644 --- a/src/data/gradient_index.h +++ b/src/data/gradient_index.h @@ -245,6 +245,9 @@ class GHistIndexMatrix { std::vector const& values, std::vector const& mins, bst_row_t ridx, bst_feature_t fidx, bool is_cat) const; + [[nodiscard]] common::HistogramCuts& Cuts() { return cut; } + [[nodiscard]] common::HistogramCuts const& Cuts() const { return cut; } + private: std::unique_ptr columns_; std::vector hit_count_tloc_; diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc index b77c8fd84..5a2f6f8df 100644 --- a/src/data/simple_dmatrix.cc +++ b/src/data/simple_dmatrix.cc @@ -16,7 +16,8 @@ #include "../common/threading_utils.h" #include "./simple_batch_iterator.h" #include "adapter.h" -#include "batch_utils.h" // for CheckEmpty, RegenGHist +#include "batch_utils.h" // for CheckEmpty, RegenGHist +#include "ellpack_page.h" // for EllpackPage #include "gradient_index.h" #include "xgboost/c_api.h" #include "xgboost/data.h" diff --git a/src/data/sparse_page_dmatrix.cc b/src/data/sparse_page_dmatrix.cc index f84fa8c01..ec9c90b10 100644 --- a/src/data/sparse_page_dmatrix.cc +++ b/src/data/sparse_page_dmatrix.cc @@ -165,7 +165,10 @@ BatchSet SparsePageDMatrix::GetSortedColumnBatches(Context const BatchSet SparsePageDMatrix::GetGradientIndex(Context const *ctx, const BatchParam ¶m) { - CHECK_GE(param.max_bin, 2); + if (param.Initialized()) { + CHECK_GE(param.max_bin, 2); + } + detail::CheckEmpty(batch_param_, param); auto id = MakeCache(this, ".gradient_index.page", cache_prefix_, &cache_info_); this->InitializeSparsePage(ctx); 
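For context, the `SparsePageDMatrix` modified below is what the Python iterator interface produces; a rough sketch of such a setup (batch shapes and the cache path are arbitrary placeholders):

    import os
    import numpy as np
    import xgboost as xgb

    class Batches(xgb.DataIter):
        """Yield a few in-memory batches; real uses would load from disk."""

        def __init__(self) -> None:
            rng = np.random.default_rng(0)
            self._data = [(rng.random((64, 4)), rng.random(64)) for _ in range(3)]
            self._it = 0
            super().__init__(cache_prefix=os.path.join(".", "cache"))

        def next(self, input_data) -> int:
            if self._it == len(self._data):
                return 0  # signal the end of one pass over the data
            X, y = self._data[self._it]
            input_data(data=X, label=y)
            self._it += 1
            return 1

        def reset(self) -> None:
            self._it = 0

    Xy = xgb.DMatrix(Batches())  # a SparsePageDMatrix under the hood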
if (!cache_info_.at(id)->written || detail::RegenGHist(batch_param_, param)) { diff --git a/src/data/sparse_page_dmatrix.cu b/src/data/sparse_page_dmatrix.cu index 0a4cde43d..38304f725 100644 --- a/src/data/sparse_page_dmatrix.cu +++ b/src/data/sparse_page_dmatrix.cu @@ -1,6 +1,8 @@ /** * Copyright 2021-2023 by XGBoost contributors */ +#include + #include "../common/hist_util.cuh" #include "batch_utils.h" // for CheckEmpty, RegenGHist #include "ellpack_page.cuh" @@ -11,7 +13,9 @@ namespace xgboost::data { BatchSet SparsePageDMatrix::GetEllpackBatches(Context const* ctx, const BatchParam& param) { CHECK(ctx->IsCUDA()); - CHECK_GE(param.max_bin, 2); + if (param.Initialized()) { + CHECK_GE(param.max_bin, 2); + } detail::CheckEmpty(batch_param_, param); auto id = MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_); size_t row_stride = 0; @@ -21,8 +25,8 @@ BatchSet SparsePageDMatrix::GetEllpackBatches(Context const* ctx, cache_info_.erase(id); MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_); std::unique_ptr cuts; - cuts.reset( - new common::HistogramCuts{common::DeviceSketch(ctx->gpu_id, this, param.max_bin, 0)}); + cuts = std::make_unique( + common::DeviceSketch(ctx->gpu_id, this, param.max_bin, 0)); this->InitializeSparsePage(ctx); // reset after use. row_stride = GetRowStride(this); diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 9378bde20..e2a863e3d 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -21,6 +21,7 @@ #include "../common/io.h" #include "../common/timer.h" #include "../data/ellpack_page.cuh" +#include "../data/ellpack_page.h" #include "constraints.cuh" #include "driver.h" #include "gpu_hist/evaluate_splits.cuh" diff --git a/tests/cpp/c_api/test_c_api.cc b/tests/cpp/c_api/test_c_api.cc index 675da940c..4e1b342ae 100644 --- a/tests/cpp/c_api/test_c_api.cc +++ b/tests/cpp/c_api/test_c_api.cc @@ -8,6 +8,7 @@ #include #include +#include // for array #include // std::size_t #include // std::numeric_limits #include // std::string @@ -15,6 +16,11 @@ #include "../../../src/c_api/c_api_error.h" #include "../../../src/common/io.h" +#include "../../../src/data/adapter.h" // for ArrayAdapter +#include "../../../src/data/array_interface.h" // for ArrayInterface +#include "../../../src/data/gradient_index.h" // for GHistIndexMatrix +#include "../../../src/data/iterative_dmatrix.h" // for IterativeDMatrix +#include "../../../src/data/sparse_page_dmatrix.h" // for SparsePageDMatrix #include "../helpers.h" TEST(CAPI, XGDMatrixCreateFromMatDT) { @@ -137,9 +143,9 @@ TEST(CAPI, ConfigIO) { BoosterHandle handle = learner.get(); learner->UpdateOneIter(0, p_dmat); - char const* out[1]; + std::array out; bst_ulong len {0}; - XGBoosterSaveJsonConfig(handle, &len, out); + XGBoosterSaveJsonConfig(handle, &len, out.data()); std::string config_str_0 { out[0] }; auto config_0 = Json::Load({config_str_0.c_str(), config_str_0.size()}); @@ -147,7 +153,7 @@ TEST(CAPI, ConfigIO) { bst_ulong len_1 {0}; std::string config_str_1 { out[0] }; - XGBoosterSaveJsonConfig(handle, &len_1, out); + XGBoosterSaveJsonConfig(handle, &len_1, out.data()); auto config_1 = Json::Load({config_str_1.c_str(), config_str_1.size()}); ASSERT_EQ(config_0, config_1); @@ -266,9 +272,9 @@ TEST(CAPI, DMatrixSetFeatureName) { ASSERT_EQ(std::to_string(i), c_out_features[i]); } - char const* feat_types [] {"i", "q"}; + std::array feat_types{"i", "q"}; static_assert(sizeof(feat_types) / sizeof(feat_types[0]) == kCols); - XGDMatrixSetStrFeatureInfo(handle, 
"feature_type", feat_types, kCols); + XGDMatrixSetStrFeatureInfo(handle, "feature_type", feat_types.data(), kCols); char const **c_out_types; XGDMatrixGetStrFeatureInfo(handle, u8"feature_type", &out_len, &c_out_types); @@ -405,4 +411,210 @@ TEST(CAPI, JArgs) { ASSERT_THROW({ RequiredArg(args, "null", __func__); }, dmlc::Error); } } + +namespace { +void MakeLabelForTest(std::shared_ptr Xy, DMatrixHandle cxy) { + auto n_samples = Xy->Info().num_row_; + std::vector y(n_samples); + for (std::size_t i = 0; i < y.size(); ++i) { + y[i] = static_cast(i); + } + + Xy->Info().labels.Reshape(n_samples); + Xy->Info().labels.Data()->HostVector() = y; + + auto y_int = GetArrayInterface(Xy->Info().labels.Data(), n_samples, 1); + std::string s_y_int; + Json::Dump(y_int, &s_y_int); + + XGDMatrixSetInfoFromInterface(cxy, "label", s_y_int.c_str()); +} + +auto MakeSimpleDMatrixForTest(bst_row_t n_samples, bst_feature_t n_features, Json dconfig) { + HostDeviceVector storage; + auto arr_int = RandomDataGenerator{n_samples, n_features, 0.5f}.GenerateArrayInterface(&storage); + + data::ArrayAdapter adapter{StringView{arr_int}}; + std::shared_ptr Xy{ + DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), Context{}.Threads())}; + + DMatrixHandle p_fmat; + std::string s_dconfig; + Json::Dump(dconfig, &s_dconfig); + CHECK_EQ(XGDMatrixCreateFromDense(arr_int.c_str(), s_dconfig.c_str(), &p_fmat), 0); + + MakeLabelForTest(Xy, p_fmat); + return std::pair{p_fmat, Xy}; +} + +auto MakeQDMForTest(Context const *ctx, bst_row_t n_samples, bst_feature_t n_features, + Json dconfig) { + bst_bin_t n_bins{16}; + dconfig["max_bin"] = Integer{n_bins}; + + std::size_t n_batches{4}; + std::unique_ptr iter_0; + if (ctx->IsCUDA()) { + iter_0 = std::make_unique(0.0f, n_samples, n_features, n_batches); + } else { + iter_0 = std::make_unique(0.0f, n_samples, n_features, n_batches); + } + std::string s_dconfig; + Json::Dump(dconfig, &s_dconfig); + DMatrixHandle p_fmat; + CHECK_EQ(XGQuantileDMatrixCreateFromCallback(static_cast(iter_0.get()), + iter_0->Proxy(), nullptr, Reset, Next, + s_dconfig.c_str(), &p_fmat), + 0); + + std::unique_ptr iter_1; + if (ctx->IsCUDA()) { + iter_1 = std::make_unique(0.0f, n_samples, n_features, n_batches); + } else { + iter_1 = std::make_unique(0.0f, n_samples, n_features, n_batches); + } + auto Xy = + std::make_shared(iter_1.get(), iter_1->Proxy(), nullptr, Reset, Next, + std::numeric_limits::quiet_NaN(), 0, n_bins); + return std::pair{p_fmat, Xy}; +} + +auto MakeExtMemForTest(bst_row_t n_samples, bst_feature_t n_features, Json dconfig) { + std::size_t n_batches{4}; + NumpyArrayIterForTest iter_0{0.0f, n_samples, n_features, n_batches}; + std::string s_dconfig; + dconfig["cache_prefix"] = String{"cache"}; + Json::Dump(dconfig, &s_dconfig); + DMatrixHandle p_fmat; + CHECK_EQ(XGDMatrixCreateFromCallback(static_cast(&iter_0), iter_0.Proxy(), Reset, + Next, s_dconfig.c_str(), &p_fmat), + 0); + + NumpyArrayIterForTest iter_1{0.0f, n_samples, n_features, n_batches}; + auto Xy = std::make_shared( + &iter_1, iter_1.Proxy(), Reset, Next, std::numeric_limits::quiet_NaN(), 0, ""); + MakeLabelForTest(Xy, p_fmat); + return std::pair{p_fmat, Xy}; +} + +template +void CheckResult(Context const *ctx, bst_feature_t n_features, std::shared_ptr Xy, + float const *out_data, std::uint64_t const *out_indptr) { + for (auto const &page : Xy->GetBatches(ctx, BatchParam{16, 0.2})) { + auto const &cut = page.Cuts(); + auto const &ptrs = cut.Ptrs(); + auto const &vals = cut.Values(); + auto const &mins = cut.MinValues(); + for 
(bst_feature_t f = 0; f < Xy->Info().num_col_; ++f) { + ASSERT_EQ(ptrs[f] + f, out_indptr[f]); + ASSERT_EQ(mins[f], out_data[out_indptr[f]]); + auto beg = out_indptr[f]; + auto end = out_indptr[f + 1]; + auto val_beg = ptrs[f]; + for (std::uint64_t i = beg + 1, j = val_beg; i < end; ++i, ++j) { + ASSERT_EQ(vals[j], out_data[i]); + } + } + + ASSERT_EQ(ptrs[n_features] + n_features, out_indptr[n_features]); + } +} + +void TestXGDMatrixGetQuantileCut(Context const *ctx) { + bst_row_t n_samples{1024}; + bst_feature_t n_features{16}; + + Json dconfig{Object{}}; + dconfig["ntread"] = Integer{Context{}.Threads()}; + dconfig["missing"] = Number{std::numeric_limits::quiet_NaN()}; + + auto check_result = [n_features, &ctx](std::shared_ptr Xy, StringView s_out_data, + StringView s_out_indptr) { + auto i_out_data = ArrayInterface<1, false>{s_out_data}; + ASSERT_EQ(i_out_data.type, ArrayInterfaceHandler::kF4); + auto out_data = static_cast(i_out_data.data); + ASSERT_TRUE(out_data); + + auto i_out_indptr = ArrayInterface<1, false>{s_out_indptr}; + ASSERT_EQ(i_out_indptr.type, ArrayInterfaceHandler::kU8); + auto out_indptr = static_cast(i_out_indptr.data); + ASSERT_TRUE(out_data); + + if (ctx->IsCPU()) { + CheckResult(ctx, n_features, Xy, out_data, out_indptr); + } else { + CheckResult(ctx, n_features, Xy, out_data, out_indptr); + } + }; + + Json config{Null{}}; + std::string s_config; + Json::Dump(config, &s_config); + char const *out_indptr; + char const *out_data; + + { + // SimpleDMatrix + auto [p_fmat, Xy] = MakeSimpleDMatrixForTest(n_samples, n_features, dconfig); + // assert fail, we don't have the quantile yet. + ASSERT_EQ(XGDMatrixGetQuantileCut(p_fmat, s_config.c_str(), &out_indptr, &out_data), -1); + + std::array mats{p_fmat}; + BoosterHandle booster; + ASSERT_EQ(XGBoosterCreate(mats.data(), 1, &booster), 0); + ASSERT_EQ(XGBoosterSetParam(booster, "max_bin", "16"), 0); + if (ctx->IsCUDA()) { + ASSERT_EQ(XGBoosterSetParam(booster, "tree_method", "gpu_hist"), 0); + } + ASSERT_EQ(XGBoosterUpdateOneIter(booster, 0, p_fmat), 0); + ASSERT_EQ(XGDMatrixGetQuantileCut(p_fmat, s_config.c_str(), &out_indptr, &out_data), 0); + + check_result(Xy, out_data, out_indptr); + + XGDMatrixFree(p_fmat); + XGBoosterFree(booster); + } + + { + // IterativeDMatrix + auto [p_fmat, Xy] = MakeQDMForTest(ctx, n_samples, n_features, dconfig); + ASSERT_EQ(XGDMatrixGetQuantileCut(p_fmat, s_config.c_str(), &out_indptr, &out_data), 0); + + check_result(Xy, out_data, out_indptr); + XGDMatrixFree(p_fmat); + } + + { + // SparsePageDMatrix + auto [p_fmat, Xy] = MakeExtMemForTest(n_samples, n_features, dconfig); + // assert fail, we don't have the quantile yet. 
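The Python-level behaviour being asserted here: for a plain `DMatrix` the cuts only exist after a histogram-based training step. A sketch:

    import numpy as np
    import xgboost as xgb

    X, y = np.random.rand(256, 4), np.random.rand(256)
    Xy = xgb.DMatrix(X, y)
    # Xy.get_quantile_cut() would raise at this point: for a plain DMatrix
    # the quantile cuts are generated during training.
    xgb.train({"tree_method": "hist", "max_bin": 16}, Xy, num_boost_round=1)
    indptr, data = Xy.get_quantile_cut()  # available now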
+ ASSERT_EQ(XGDMatrixGetQuantileCut(p_fmat, s_config.c_str(), &out_indptr, &out_data), -1); + + std::array mats{p_fmat}; + BoosterHandle booster; + ASSERT_EQ(XGBoosterCreate(mats.data(), 1, &booster), 0); + ASSERT_EQ(XGBoosterSetParam(booster, "max_bin", "16"), 0); + if (ctx->IsCUDA()) { + ASSERT_EQ(XGBoosterSetParam(booster, "tree_method", "gpu_hist"), 0); + } + ASSERT_EQ(XGBoosterUpdateOneIter(booster, 0, p_fmat), 0); + ASSERT_EQ(XGDMatrixGetQuantileCut(p_fmat, s_config.c_str(), &out_indptr, &out_data), 0); + + XGDMatrixFree(p_fmat); + XGBoosterFree(booster); + } +} +} // namespace + +TEST(CAPI, XGDMatrixGetQuantileCut) { + Context ctx; + TestXGDMatrixGetQuantileCut(&ctx); +} + +#if defined(XGBOOST_USE_CUDA) +TEST(CAPI, GPUXGDMatrixGetQuantileCut) { + auto ctx = MakeCUDACtx(0); + TestXGDMatrixGetQuantileCut(&ctx); +} +#endif // defined(XGBOOST_USE_CUDA) } // namespace xgboost diff --git a/tests/cpp/data/test_ellpack_page.cu b/tests/cpp/data/test_ellpack_page.cu index d56f1c7b5..4b279a1a4 100644 --- a/tests/cpp/data/test_ellpack_page.cu +++ b/tests/cpp/data/test_ellpack_page.cu @@ -8,6 +8,7 @@ #include "../../../src/common/categorical.h" #include "../../../src/common/hist_util.h" #include "../../../src/data/ellpack_page.cuh" +#include "../../../src/data/ellpack_page.h" #include "../../../src/tree/param.h" // TrainParam #include "../helpers.h" #include "../histogram_helpers.h" diff --git a/tests/cpp/data/test_iterative_dmatrix.cu b/tests/cpp/data/test_iterative_dmatrix.cu index 2f2f1f84f..6b856f3fa 100644 --- a/tests/cpp/data/test_iterative_dmatrix.cu +++ b/tests/cpp/data/test_iterative_dmatrix.cu @@ -5,6 +5,7 @@ #include "../../../src/data/device_adapter.cuh" #include "../../../src/data/ellpack_page.cuh" +#include "../../../src/data/ellpack_page.h" #include "../../../src/data/iterative_dmatrix.h" #include "../../../src/tree/param.h" // TrainParam #include "../helpers.h" diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu index f2f828507..17ed64c90 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cu +++ b/tests/cpp/data/test_sparse_page_dmatrix.cu @@ -5,6 +5,7 @@ #include "../../../src/common/compressed_iterator.h" #include "../../../src/data/ellpack_page.cuh" +#include "../../../src/data/ellpack_page.h" #include "../../../src/data/sparse_page_dmatrix.h" #include "../../../src/tree/param.h" // TrainParam #include "../filesystem.h" // dmlc::TemporaryDirectory diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 4f44b7b1e..111c7b30e 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -472,6 +472,18 @@ std::shared_ptr RandomDataGenerator::GenerateQuantileDMatrix(bool with_ return m; } +#if !defined(XGBOOST_USE_CUDA) +CudaArrayIterForTest::CudaArrayIterForTest(float sparsity, size_t rows, size_t cols, size_t batches) + : ArrayIterForTest{sparsity, rows, cols, batches} { + common::AssertGPUSupport(); +} + +int CudaArrayIterForTest::Next() { + common::AssertGPUSupport(); + return 0; +} +#endif // !defined(XGBOOST_USE_CUDA) + NumpyArrayIterForTest::NumpyArrayIterForTest(float sparsity, size_t rows, size_t cols, size_t batches) : ArrayIterForTest{sparsity, rows, cols, batches} { @@ -650,7 +662,7 @@ std::unique_ptr CreateTrainedGBM(std::string name, Args kwargs, ArrayIterForTest::ArrayIterForTest(float sparsity, size_t rows, size_t cols, size_t batches) : rows_{rows}, cols_{cols}, n_batches_{batches} { XGProxyDMatrixCreate(&proxy_); - rng_.reset(new RandomDataGenerator{rows_, cols_, sparsity}); + rng_ = 
std::make_unique(rows_, cols_, sparsity); std::tie(batches_, interface_) = rng_->GenerateArrayInterfaceBatch(&data_, n_batches_); } diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index fd3034db5..b250cd2ab 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -11,6 +11,8 @@ #include #include "../../../src/common/common.h" +#include "../../../src/data/ellpack_page.cuh" // for EllpackPageImpl +#include "../../../src/data/ellpack_page.h" // for EllpackPage #include "../../../src/data/sparse_page_source.h" #include "../../../src/tree/constraints.cuh" #include "../../../src/tree/param.h" // for TrainParam diff --git a/tests/python-gpu/test_from_cupy.py b/tests/python-gpu/test_from_cupy.py index 70080b13a..71667fa7b 100644 --- a/tests/python-gpu/test_from_cupy.py +++ b/tests/python-gpu/test_from_cupy.py @@ -1,3 +1,4 @@ +import json import sys import numpy as np @@ -10,6 +11,16 @@ from test_dmatrix import set_base_margin_info from xgboost import testing as tm +cupy = pytest.importorskip("cupy") + + +def test_array_interface() -> None: + arr = cupy.array([[1, 2, 3, 4], [1, 2, 3, 4]]) + i_arr = arr.__cuda_array_interface__ + i_arr = json.loads(json.dumps(i_arr)) + ret = xgb.core.from_array_interface(i_arr) + np.testing.assert_equal(cupy.asnumpy(arr), cupy.asnumpy(ret)) + def dmatrix_from_cupy(input_type, DMatrixT, missing=np.NAN): '''Test constructing DMatrix from cupy''' diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py index a6b183daf..7fea42f60 100644 --- a/tests/python-gpu/test_gpu_updaters.py +++ b/tests/python-gpu/test_gpu_updaters.py @@ -8,7 +8,11 @@ from hypothesis import assume, given, note, settings, strategies import xgboost as xgb from xgboost import testing as tm from xgboost.testing.params import cat_parameter_strategy, hist_parameter_strategy -from xgboost.testing.updater import check_init_estimation, check_quantile_loss +from xgboost.testing.updater import ( + check_get_quantile_cut, + check_init_estimation, + check_quantile_loss, +) sys.path.append("tests/python") import test_updaters as test_up @@ -264,3 +268,7 @@ class TestGPUUpdaters: }, num_boost_round=150, ) + + @pytest.mark.skipif(**tm.no_cudf()) + def test_get_quantile_cut(self) -> None: + check_get_quantile_cut("gpu_hist") diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index 095c9936a..2027942fe 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -14,7 +14,11 @@ from xgboost.testing.params import ( hist_multi_parameter_strategy, hist_parameter_strategy, ) -from xgboost.testing.updater import check_init_estimation, check_quantile_loss +from xgboost.testing.updater import ( + check_get_quantile_cut, + check_init_estimation, + check_quantile_loss, +) def train_result(param, dmat, num_rounds): @@ -537,3 +541,8 @@ class TestTreeMethod: @pytest.mark.parametrize("weighted", [True, False]) def test_quantile_loss(self, weighted: bool) -> None: check_quantile_loss("hist", weighted) + + @pytest.mark.skipif(**tm.no_pandas()) + @pytest.mark.parametrize("tree_method", ["hist"]) + def test_get_quantile_cut(self, tree_method: str) -> None: + check_get_quantile_cut(tree_method) From 97ed944209807f552bcf28a693cb8d3ba2440a8a Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 11 Jul 2023 10:04:39 +0800 Subject: [PATCH 027/136] Unify the hist tree method for different devices. 
(#9363) --- R-package/src/Makevars.in | 1 + R-package/src/Makevars.win | 1 + src/common/error_msg.cc | 36 ++++ src/common/error_msg.h | 5 +- src/gbm/gbtree.cc | 182 ++++++++------------- src/gbm/gbtree.h | 32 ++-- tests/cpp/gbm/test_gbtree.cc | 117 ++++++++++++- tests/python/test_training_continuation.py | 10 +- 8 files changed, 242 insertions(+), 142 deletions(-) create mode 100644 src/common/error_msg.cc diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in index a84459db9..f199544a3 100644 --- a/R-package/src/Makevars.in +++ b/R-package/src/Makevars.in @@ -82,6 +82,7 @@ OBJECTS= \ $(PKGROOT)/src/common/charconv.o \ $(PKGROOT)/src/common/column_matrix.o \ $(PKGROOT)/src/common/common.o \ + $(PKGROOT)/src/common/error_msg.o \ $(PKGROOT)/src/common/hist_util.o \ $(PKGROOT)/src/common/host_device_vector.o \ $(PKGROOT)/src/common/io.o \ diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win index 25c577e3a..2e7f98113 100644 --- a/R-package/src/Makevars.win +++ b/R-package/src/Makevars.win @@ -82,6 +82,7 @@ OBJECTS= \ $(PKGROOT)/src/common/charconv.o \ $(PKGROOT)/src/common/column_matrix.o \ $(PKGROOT)/src/common/common.o \ + $(PKGROOT)/src/common/error_msg.o \ $(PKGROOT)/src/common/hist_util.o \ $(PKGROOT)/src/common/host_device_vector.o \ $(PKGROOT)/src/common/io.o \ diff --git a/src/common/error_msg.cc b/src/common/error_msg.cc new file mode 100644 index 000000000..813cbe8b1 --- /dev/null +++ b/src/common/error_msg.cc @@ -0,0 +1,36 @@ +/** + * Copyright 2023 by XGBoost contributors + */ +#include "error_msg.h" + +#include "xgboost/logging.h" + +namespace xgboost::error { +void WarnDeprecatedGPUHist() { + bool static thread_local logged{false}; + if (logged) { + return; + } + auto msg = + "The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` " + R"(parameter to CUDA instead. + + E.g. tree_method = "hist", device = "CUDA" + +)"; + LOG(WARNING) << msg; + logged = true; +} + +void WarnManualUpdater() { + bool static thread_local logged{false}; + if (logged) { + return; + } + LOG(WARNING) + << "You have manually specified the `updater` parameter. The `tree_method` parameter " + "will be ignored. Incorrect sequence of updaters will produce undefined " + "behavior. For common uses, we recommend using `tree_method` parameter instead."; + logged = true; +} +} // namespace xgboost::error diff --git a/src/common/error_msg.h b/src/common/error_msg.h index 37d70b144..c19197007 100644 --- a/src/common/error_msg.h +++ b/src/common/error_msg.h @@ -75,9 +75,12 @@ inline void WarnOldSerialization() { if (logged) { return; } - LOG(WARNING) << OldSerialization(); logged = true; } + +void WarnDeprecatedGPUHist(); + +void WarnManualUpdater(); } // namespace xgboost::error #endif // XGBOOST_COMMON_ERROR_MSG_H_ diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 4f8cd47bb..55b935ea0 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -9,7 +9,7 @@ #include #include -#include +#include // for equal #include // for uint32_t #include #include @@ -40,8 +40,53 @@ namespace xgboost::gbm { DMLC_REGISTRY_FILE_TAG(gbtree); +namespace { +/** @brief Map the `tree_method` parameter to the `updater` parameter. 
*/ +std::string MapTreeMethodToUpdaters(Context const* ctx_, TreeMethod tree_method) { + // Choose updaters according to tree_method parameters + switch (tree_method) { + case TreeMethod::kAuto: // Use hist as default in 2.0 + case TreeMethod::kHist: { + return ctx_->DispatchDevice([] { return "grow_quantile_histmaker"; }, + [] { + common::AssertGPUSupport(); + return "grow_gpu_hist"; + }); + } + case TreeMethod::kApprox: + CHECK(ctx_->IsCPU()) << "The `approx` tree method is not supported on GPU."; + return "grow_histmaker"; + case TreeMethod::kExact: + CHECK(ctx_->IsCPU()) << "The `exact` tree method is not supported on GPU."; + return "grow_colmaker,prune"; + case TreeMethod::kGPUHist: { + common::AssertGPUSupport(); + error::WarnDeprecatedGPUHist(); + return "grow_gpu_hist"; + } + default: + auto tm = static_cast>(tree_method); + LOG(FATAL) << "Unknown tree_method: `" << tm << "`."; + } + + LOG(FATAL) << "unreachable"; + return ""; +} + +bool UpdatersMatched(std::vector updater_seq, + std::vector> const& updaters) { + if (updater_seq.size() != updaters.size()) { + return false; + } + + return std::equal(updater_seq.cbegin(), updater_seq.cend(), updaters.cbegin(), + [](std::string const& name, std::unique_ptr const& up) { + return name == up->Name(); + }); +} +} // namespace + void GBTree::Configure(Args const& cfg) { - std::string updater_seq = tparam_.updater_seq; tparam_.UpdateAllowUnknown(cfg); tree_param_.UpdateAllowUnknown(cfg); @@ -54,8 +99,7 @@ void GBTree::Configure(Args const& cfg) { // configure predictors if (!cpu_predictor_) { - cpu_predictor_ = std::unique_ptr( - Predictor::Create("cpu_predictor", this->ctx_)); + cpu_predictor_ = std::unique_ptr(Predictor::Create("cpu_predictor", this->ctx_)); } cpu_predictor_->Configure(cfg); #if defined(XGBOOST_USE_CUDA) @@ -70,26 +114,17 @@ void GBTree::Configure(Args const& cfg) { #if defined(XGBOOST_USE_ONEAPI) if (!oneapi_predictor_) { - oneapi_predictor_ = std::unique_ptr( - Predictor::Create("oneapi_predictor", this->ctx_)); + oneapi_predictor_ = + std::unique_ptr(Predictor::Create("oneapi_predictor", this->ctx_)); } oneapi_predictor_->Configure(cfg); #endif // defined(XGBOOST_USE_ONEAPI) - monitor_.Init("GBTree"); - - specified_updater_ = std::any_of( - cfg.cbegin(), cfg.cend(), - [](std::pair const& arg) { return arg.first == "updater"; }); - - if (specified_updater_ && !showed_updater_warning_) { - LOG(WARNING) << "DANGER AHEAD: You have manually specified `updater` " - "parameter. The `tree_method` parameter will be ignored. " - "Incorrect sequence of updaters will produce undefined " - "behavior. For common uses, we recommend using " - "`tree_method` parameter instead."; - // Don't drive users to silent XGBOost. 
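``MapTreeMethodToUpdaters`` above replaces the old string table: the updater is now chosen from the device and the tree method together, and ``gpu_hist`` survives only as a deprecated alias. A sketch of the user-visible behavior from Python, assuming a build of this branch (random data; the CPU path is shown, and the commented line needs a CUDA build):

.. code-block:: python

   import numpy as np
   import xgboost as xgb

   X, y = np.random.rand(256, 8), np.random.rand(256)
   dtrain = xgb.DMatrix(X, label=y)

   # "hist" resolves to grow_quantile_histmaker on CPU and grow_gpu_hist on CUDA.
   booster = xgb.train({"tree_method": "hist"}, dtrain, num_boost_round=2)

   # The old spelling still trains but goes through WarnDeprecatedGPUHist:
   # xgb.train({"tree_method": "gpu_hist"}, dtrain, num_boost_round=2)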
- showed_updater_warning_ = true; + // `updater` parameter was manually specified + specified_updater_ = + std::any_of(cfg.cbegin(), cfg.cend(), [](auto const& arg) { return arg.first == "updater"; }); + if (specified_updater_) { + error::WarnManualUpdater(); } if (model_.learner_model_param->IsVectorLeaf()) { @@ -97,47 +132,25 @@ void GBTree::Configure(Args const& cfg) { << "Only the hist tree method is supported for building multi-target trees with vector " "leaf."; } + LOG(DEBUG) << "Using tree method: " << static_cast(tparam_.tree_method); - this->ConfigureUpdaters(); - if (updater_seq != tparam_.updater_seq) { + if (!specified_updater_) { + this->tparam_.updater_seq = MapTreeMethodToUpdaters(ctx_, tparam_.tree_method); + } + + auto up_names = common::Split(tparam_.updater_seq, ','); + if (!UpdatersMatched(up_names, updaters_)) { updaters_.clear(); - this->InitUpdater(cfg); - } else { - for (auto& up : updaters_) { - up->Configure(cfg); + for (auto const& name : up_names) { + std::unique_ptr up( + TreeUpdater::Create(name.c_str(), ctx_, &model_.learner_model_param->task)); + updaters_.push_back(std::move(up)); } } - configured_ = true; -} - -void GBTree::ConfigureUpdaters() { - if (specified_updater_) { - return; - } - // `updater` parameter was manually specified - /* Choose updaters according to tree_method parameters */ - switch (tparam_.tree_method) { - case TreeMethod::kAuto: // Use hist as default in 2.0 - case TreeMethod::kHist: { - tparam_.updater_seq = "grow_quantile_histmaker"; - break; - } - case TreeMethod::kApprox: - tparam_.updater_seq = "grow_histmaker"; - break; - case TreeMethod::kExact: - tparam_.updater_seq = "grow_colmaker,prune"; - break; - case TreeMethod::kGPUHist: { - common::AssertGPUSupport(); - tparam_.updater_seq = "grow_gpu_hist"; - break; - } - default: - LOG(FATAL) << "Unknown tree_method (" << static_cast(tparam_.tree_method) - << ") detected"; + for (auto& up : updaters_) { + up->Configure(cfg); } } @@ -195,14 +208,8 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector* in_gpair, bst_target_t const n_groups = model_.learner_model_param->OutputLength(); monitor_.Start("BoostNewTrees"); - // Weird case that tree method is cpu-based but gpu_id is set. Ideally we should let - // `gpu_id` be the single source of determining what algorithms to run, but that will - // break a lots of existing code. - auto device = tparam_.tree_method != TreeMethod::kGPUHist ? Context::kCpuId : ctx_->gpu_id; - auto out = linalg::MakeTensorView( - device, - device == Context::kCpuId ? predt->predictions.HostSpan() : predt->predictions.DeviceSpan(), - p_fmat->Info().num_row_, model_.learner_model_param->OutputLength()); + auto out = linalg::MakeTensorView(ctx_, &predt->predictions, p_fmat->Info().num_row_, + model_.learner_model_param->OutputLength()); CHECK_NE(n_groups, 0); if (!p_fmat->SingleColBlock() && obj->Task().UpdateTreeLeaf()) { @@ -261,47 +268,6 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector* in_gpair, this->CommitModel(std::move(new_trees)); } -void GBTree::InitUpdater(Args const& cfg) { - std::string tval = tparam_.updater_seq; - std::vector ups = common::Split(tval, ','); - - if (updaters_.size() != 0) { - // Assert we have a valid set of updaters. 
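With the rewritten ``Configure`` above, an explicit ``updater`` entry still takes precedence over ``tree_method``, but the message now goes through the one-shot ``WarnManualUpdater``. A sketch from the Python side (random data; the parameter combination is only for demonstration):

.. code-block:: python

   import numpy as np
   import xgboost as xgb

   X, y = np.random.rand(256, 8), np.random.rand(256)
   dtrain = xgb.DMatrix(X, label=y)

   # `updater` wins over `tree_method`; XGBoost warns once that an incorrect
   # updater sequence produces undefined behavior.
   booster = xgb.train(
       {"updater": "grow_histmaker", "tree_method": "hist"},
       dtrain,
       num_boost_round=2,
   )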
- CHECK_EQ(ups.size(), updaters_.size()); - for (auto const& up : updaters_) { - bool contains = std::any_of(ups.cbegin(), ups.cend(), - [&up](std::string const& name) { - return name == up->Name(); - }); - if (!contains) { - std::stringstream ss; - ss << "Internal Error: " << " mismatched updater sequence.\n"; - ss << "Specified updaters: "; - std::for_each(ups.cbegin(), ups.cend(), - [&ss](std::string const& name){ - ss << name << " "; - }); - ss << "\n" << "Actual updaters: "; - std::for_each(updaters_.cbegin(), updaters_.cend(), - [&ss](std::unique_ptr const& updater){ - ss << updater->Name() << " "; - }); - LOG(FATAL) << ss.str(); - } - } - // Do not push new updater in. - return; - } - - // create new updaters - for (const std::string& pstr : ups) { - std::unique_ptr up( - TreeUpdater::Create(pstr.c_str(), ctx_, &model_.learner_model_param->task)); - up->Configure(cfg); - updaters_.push_back(std::move(up)); - } -} - void GBTree::BoostNewTrees(HostDeviceVector* gpair, DMatrix* p_fmat, int bst_group, std::vector>* out_position, TreesOneGroup* ret) { @@ -310,6 +276,7 @@ void GBTree::BoostNewTrees(HostDeviceVector* gpair, DMatrix* p_fma // create the trees for (int i = 0; i < model_.param.num_parallel_tree; ++i) { if (tparam_.process_type == TreeProcessType::kDefault) { + CHECK(!updaters_.empty()); CHECK(!updaters_.front()->CanModifyTree()) << "Updater: `" << updaters_.front()->Name() << "` " << "can not be used to create new trees. " @@ -465,7 +432,6 @@ void GBTree::SaveModel(Json* p_out) const { void GBTree::Slice(bst_layer_t begin, bst_layer_t end, bst_layer_t step, GradientBooster* out, bool* out_of_bound) const { - CHECK(configured_); CHECK(out); auto p_gbtree = dynamic_cast(out); @@ -517,7 +483,6 @@ void GBTree::Slice(bst_layer_t begin, bst_layer_t end, bst_layer_t step, Gradien void GBTree::PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training, bst_layer_t layer_begin, bst_layer_t layer_end) const { - CHECK(configured_); if (layer_end == 0) { layer_end = this->BoostedRounds(); } @@ -577,7 +542,6 @@ void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool void GBTree::InplacePredict(std::shared_ptr p_m, float missing, PredictionCacheEntry* out_preds, bst_layer_t layer_begin, bst_layer_t layer_end) const { - CHECK(configured_); auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end); CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees."; if (p_m->Ctx()->Device() != this->ctx_->Device()) { @@ -606,8 +570,6 @@ void GBTree::InplacePredict(std::shared_ptr p_m, float missing, [[nodiscard]] std::unique_ptr const& GBTree::GetPredictor( bool is_training, HostDeviceVector const* out_pred, DMatrix* f_dmat) const { - CHECK(configured_); - // Data comes from SparsePageDMatrix. Since we are loading data in pages, no need to // prevent data copy. 
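``InplacePredict`` above now validates the requested tree range and relies on the booster's context instead of the removed ``configured_`` flag. From Python this is the path behind ``inplace_predict``; a small sketch with random data:

.. code-block:: python

   import numpy as np
   import xgboost as xgb

   X, y = np.random.rand(256, 8), np.random.rand(256)
   booster = xgb.train({"tree_method": "hist"}, xgb.DMatrix(X, label=y),
                       num_boost_round=4)

   # Predict straight from the array; iteration_range maps to the
   # layer_begin/layer_end pair in the C++ signature above.
   preds = booster.inplace_predict(X, iteration_range=(0, 2))
   print(preds.shape)  # (256,)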
if (f_dmat && !f_dmat->SingleColBlock()) { @@ -914,7 +876,6 @@ class Dart : public GBTree { void PredictContribution(DMatrix* p_fmat, HostDeviceVector* out_contribs, bst_layer_t layer_begin, bst_layer_t layer_end, bool approximate) override { - CHECK(configured_); auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end); cpu_predictor_->PredictContribution(p_fmat, out_contribs, model_, tree_end, &weight_drop_, approximate); @@ -923,7 +884,6 @@ class Dart : public GBTree { void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector* out_contribs, bst_layer_t layer_begin, bst_layer_t layer_end, bool approximate) override { - CHECK(configured_); auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end); cpu_predictor_->PredictInteractionContributions(p_fmat, out_contribs, model_, tree_end, &weight_drop_, approximate); diff --git a/src/gbm/gbtree.h b/src/gbm/gbtree.h index aa45433df..81e568368 100644 --- a/src/gbm/gbtree.h +++ b/src/gbm/gbtree.h @@ -59,9 +59,7 @@ struct GBTreeTrainParam : public XGBoostParameter { TreeMethod tree_method; // declare parameters DMLC_DECLARE_PARAMETER(GBTreeTrainParam) { - DMLC_DECLARE_FIELD(updater_seq) - .set_default("grow_colmaker,prune") - .describe("Tree updater sequence."); + DMLC_DECLARE_FIELD(updater_seq).describe("Tree updater sequence.").set_default(""); DMLC_DECLARE_FIELD(process_type) .set_default(TreeProcessType::kDefault) .add_enum("default", TreeProcessType::kDefault) @@ -170,22 +168,21 @@ bool SliceTrees(bst_layer_t begin, bst_layer_t end, bst_layer_t step, GBTreeMode class GBTree : public GradientBooster { public: explicit GBTree(LearnerModelParam const* booster_config, Context const* ctx) - : GradientBooster{ctx}, model_(booster_config, ctx_) {} - - void Configure(const Args& cfg) override; - /*! \brief Map `tree_method` parameter to `updater` parameter */ - void ConfigureUpdaters(); + : GradientBooster{ctx}, model_(booster_config, ctx_) { + monitor_.Init(__func__); + } + void Configure(Args const& cfg) override; /** - * \brief Optionally update the leaf value. + * @brief Optionally update the leaf value. */ void UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector const& predictions, - ObjFunction const* obj, - std::int32_t group_idx, + ObjFunction const* obj, std::int32_t group_idx, std::vector> const& node_position, std::vector>* p_trees); - - /*! \brief Carry out one iteration of boosting */ + /** + * @brief Carry out one iteration of boosting. 
+ */ void DoBoost(DMatrix* p_fmat, HostDeviceVector* in_gpair, PredictionCacheEntry* predt, ObjFunction const* obj) override; @@ -289,7 +286,6 @@ class GBTree : public GradientBooster { void PredictInstance(const SparsePage::Inst& inst, std::vector* out_preds, uint32_t layer_begin, uint32_t layer_end) override { - CHECK(configured_); std::uint32_t _, tree_end; std::tie(_, tree_end) = detail::LayerToTree(model_, layer_begin, layer_end); cpu_predictor_->PredictInstance(inst, out_preds, model_, tree_end); @@ -307,7 +303,6 @@ class GBTree : public GradientBooster { void PredictContribution(DMatrix* p_fmat, HostDeviceVector* out_contribs, bst_layer_t layer_begin, bst_layer_t layer_end, bool approximate) override { - CHECK(configured_); auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end); CHECK_EQ(tree_begin, 0) << "Predict contribution supports only iteration end: (0, " "n_iteration), using model slicing instead."; @@ -318,7 +313,6 @@ class GBTree : public GradientBooster { void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector* out_contribs, bst_layer_t layer_begin, bst_layer_t layer_end, bool approximate) override { - CHECK(configured_); auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end); CHECK_EQ(tree_begin, 0) << "Predict interaction contribution supports only iteration end: (0, " "n_iteration), using model slicing instead."; @@ -332,9 +326,6 @@ class GBTree : public GradientBooster { } protected: - // initialize updater before using them - void InitUpdater(Args const& cfg); - void BoostNewTrees(HostDeviceVector* gpair, DMatrix* p_fmat, int bst_group, std::vector>* out_position, std::vector>* ret); @@ -352,10 +343,7 @@ class GBTree : public GradientBooster { GBTreeTrainParam tparam_; // Tree training parameter tree::TrainParam tree_param_; - // ----training fields---- - bool showed_updater_warning_ {false}; bool specified_updater_ {false}; - bool configured_ {false}; // the updaters that can be applied to each of tree std::vector> updaters_; // Predictors diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc index 7a3008cff..1c0927031 100644 --- a/tests/cpp/gbm/test_gbtree.cc +++ b/tests/cpp/gbm/test_gbtree.cc @@ -4,11 +4,13 @@ #include #include #include // for HostDeviceVector +#include // for Json, Object #include // for Learner -#include // for numeric_limits -#include // for shared_ptr -#include // for string +#include // for numeric_limits +#include // for shared_ptr +#include // for optional +#include // for string #include "../../../src/data/proxy_dmatrix.h" // for DMatrixProxy #include "../../../src/gbm/gbtree.h" @@ -165,6 +167,115 @@ TEST(GBTree, ChoosePredictor) { // data is not pulled back into host ASSERT_FALSE(data.HostCanWrite()); } + +TEST(GBTree, ChooseTreeMethod) { + bst_row_t n_samples{128}; + bst_feature_t n_features{64}; + auto Xy = RandomDataGenerator{n_samples, n_features, 0.5f}.GenerateDMatrix(true); + + auto with_update = [&](std::optional device, + std::optional tree_method) { + auto learner = std::unique_ptr(Learner::Create({Xy})); + if (tree_method.has_value()) { + learner->SetParam("tree_method", tree_method.value()); + } + if (device.has_value()) { + learner->SetParam("gpu_id", device.value()); + } + learner->Configure(); + for (std::int32_t i = 0; i < 3; ++i) { + learner->UpdateOneIter(0, Xy); + } + Json config{Object{}}; + learner->SaveConfig(&config); + auto updater = config["learner"]["gradient_booster"]["updater"]; + CHECK(!IsA(updater)); + return updater; + }; + 
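The ``with_update`` helper above drives ``UpdateOneIter``; its counterpart below exercises ``BoostOneIter``, the entry point used when gradients are supplied externally. In Python that corresponds to training with a custom objective, e.g. this sketch with a plain squared-error objective on random data:

.. code-block:: python

   import numpy as np
   import xgboost as xgb

   X, y = np.random.rand(256, 8), np.random.rand(256)
   dtrain = xgb.DMatrix(X, label=y)

   def squared_error(preds: np.ndarray, dtrain: xgb.DMatrix):
       # Gradient and Hessian of 0.5 * (pred - label)^2.
       grad = preds - dtrain.get_label()
       hess = np.ones_like(preds)
       return grad, hess

   # Custom objectives route through BoostOneIter, so the updater selection
   # under test applies to this path as well.
   booster = xgb.train({"tree_method": "hist"}, dtrain,
                       num_boost_round=2, obj=squared_error)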
+ auto with_boost = [&](std::optional device, std::optional tree_method) { + auto learner = std::unique_ptr(Learner::Create({Xy})); + if (tree_method.has_value()) { + learner->SetParam("tree_method", tree_method.value()); + } + if (device.has_value()) { + learner->SetParam("gpu_id", device.value()); + } + learner->Configure(); + for (std::int32_t i = 0; i < 3; ++i) { + HostDeviceVector gpair{GenerateRandomGradients(Xy->Info().num_row_)}; + learner->BoostOneIter(0, Xy, &gpair); + } + + Json config{Object{}}; + learner->SaveConfig(&config); + auto updater = config["learner"]["gradient_booster"]["updater"]; + return updater; + }; + + // | | hist | gpu_hist | exact | NA | + // |--------+---------+----------+-------+-----| + // | CUDA:0 | GPU | GPU (w) | Err | GPU | # not yet tested + // | CPU | CPU | Err | CPU | CPU | # not yet tested + // |--------+---------+----------+-------+-----| + // | -1 | CPU | GPU (w) | CPU | CPU | + // | 0 | GPU | GPU (w) | Err | GPU | + // | NA | CPU | GPU (w) | CPU | CPU | + // + // - (w): warning + // - CPU: Run on CPU. + // - GPU: Run on CUDA. + // - Err: Not feasible. + // - NA: Parameter is not specified. + + // When GPU hist is specified with a CPU context, we should emit an error. However, it's + // quite difficult to detect whether the CPU context is being used because it's the + // default or because it's specified by the user. + + std::map, std::optional>, std::string> + expectation{ + // hist + {{"hist", "-1"}, "grow_quantile_histmaker"}, + {{"hist", "0"}, "grow_gpu_hist"}, + {{"hist", std::nullopt}, "grow_quantile_histmaker"}, + // gpu_hist + {{"gpu_hist", "-1"}, "grow_gpu_hist"}, + {{"gpu_hist", "0"}, "grow_gpu_hist"}, + {{"gpu_hist", std::nullopt}, "grow_gpu_hist"}, + // exact + {{"exact", "-1"}, "grow_colmaker,prune"}, + {{"exact", "0"}, "err"}, + {{"exact", std::nullopt}, "grow_colmaker,prune"}, + // NA + {{std::nullopt, "-1"}, "grow_quantile_histmaker"}, + {{std::nullopt, "0"}, "grow_gpu_hist"}, // default to hist + {{std::nullopt, std::nullopt}, "grow_quantile_histmaker"}, + }; + + auto run_test = [&](auto fn) { + for (auto const& kv : expectation) { + auto device = kv.first.second; + auto tm = kv.first.first; + + if (kv.second == "err") { + ASSERT_THROW({ fn(device, tm); }, dmlc::Error) + << " device:" << device.value_or("NA") << " tm:" << tm.value_or("NA"); + continue; + } + auto up = fn(device, tm); + auto ups = get(up); + auto exp_names = common::Split(kv.second, ','); + ASSERT_EQ(exp_names.size(), ups.size()); + for (std::size_t i = 0; i < exp_names.size(); ++i) { + ASSERT_EQ(get(ups[i]["name"]), exp_names[i]) + << " device:" << device.value_or("NA") << " tm:" << tm.value_or("NA"); + } + } + }; + + run_test(with_update); + run_test(with_boost); +} #endif // XGBOOST_USE_CUDA // Some other parts of test are in `Tree.JsonIO'. 
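The expectation table above can be reproduced from Python by parsing the saved configuration, which is exactly how the test extracts the updater sequence. A CPU-only sketch (the GPU rows need a CUDA device; the config layout is the one this patch writes):

.. code-block:: python

   import json

   import numpy as np
   import xgboost as xgb

   X, y = np.random.rand(256, 8), np.random.rand(256)
   dtrain = xgb.DMatrix(X, label=y)

   for tree_method, expected in [
       ("hist", ["grow_quantile_histmaker"]),
       ("exact", ["grow_colmaker", "prune"]),
   ]:
       booster = xgb.train({"tree_method": tree_method}, dtrain, num_boost_round=1)
       config = json.loads(booster.save_config())
       names = [u["name"] for u in config["learner"]["gradient_booster"]["updater"]]
       assert names == expected, (tree_method, names)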
diff --git a/tests/python/test_training_continuation.py b/tests/python/test_training_continuation.py index 3ec1f1ffb..3cbe6a421 100644 --- a/tests/python/test_training_continuation.py +++ b/tests/python/test_training_continuation.py @@ -57,12 +57,12 @@ class TestTrainingContinuation: gbdt_02 = xgb.train(xgb_params_01, dtrain_2class, num_boost_round=0) - gbdt_02.save_model('xgb_tc.model') + gbdt_02.save_model('xgb_tc.json') gbdt_02a = xgb.train(xgb_params_01, dtrain_2class, num_boost_round=10, xgb_model=gbdt_02) gbdt_02b = xgb.train(xgb_params_01, dtrain_2class, - num_boost_round=10, xgb_model="xgb_tc.model") + num_boost_round=10, xgb_model="xgb_tc.json") ntrees_02a = len(gbdt_02a.get_dump()) ntrees_02b = len(gbdt_02b.get_dump()) assert ntrees_02a == 10 @@ -78,18 +78,18 @@ class TestTrainingContinuation: gbdt_03 = xgb.train(xgb_params_01, dtrain_2class, num_boost_round=3) - gbdt_03.save_model('xgb_tc.model') + gbdt_03.save_model('xgb_tc.json') gbdt_03a = xgb.train(xgb_params_01, dtrain_2class, num_boost_round=7, xgb_model=gbdt_03) gbdt_03b = xgb.train(xgb_params_01, dtrain_2class, - num_boost_round=7, xgb_model="xgb_tc.model") + num_boost_round=7, xgb_model="xgb_tc.json") ntrees_03a = len(gbdt_03a.get_dump()) ntrees_03b = len(gbdt_03b.get_dump()) assert ntrees_03a == 10 assert ntrees_03b == 10 - os.remove('xgb_tc.model') + os.remove('xgb_tc.json') res1 = mean_squared_error(y_2class, gbdt_03a.predict(dtrain_2class)) res2 = mean_squared_error(y_2class, gbdt_03b.predict(dtrain_2class)) From 3632242e0b680592a0bbae7b086c42e52741cfaa Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Mon, 10 Jul 2023 21:15:56 -0700 Subject: [PATCH 028/136] Support column split with GPU quantile (#9370) --- src/common/hist_util.cu | 2 +- src/common/quantile.cu | 8 ++--- src/common/quantile.cuh | 4 +-- src/data/iterative_dmatrix.cu | 2 +- tests/cpp/common/test_hist_util.cu | 12 +++---- tests/cpp/common/test_quantile.cu | 56 ++++++++++++++++++++++++++++-- 6 files changed, 68 insertions(+), 16 deletions(-) diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu index 76fff8a98..1c9525a62 100644 --- a/src/common/hist_util.cu +++ b/src/common/hist_util.cu @@ -352,7 +352,7 @@ HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins, } } } - sketch_container.MakeCuts(&cuts); + sketch_container.MakeCuts(&cuts, dmat->Info().IsColumnSplit()); return cuts; } } // namespace common diff --git a/src/common/quantile.cu b/src/common/quantile.cu index 5c81ec2ea..25c4543c6 100644 --- a/src/common/quantile.cu +++ b/src/common/quantile.cu @@ -501,10 +501,10 @@ void SketchContainer::FixError() { }); } -void SketchContainer::AllReduce() { +void SketchContainer::AllReduce(bool is_column_split) { dh::safe_cuda(cudaSetDevice(device_)); auto world = collective::GetWorldSize(); - if (world == 1) { + if (world == 1 || is_column_split) { return; } @@ -582,13 +582,13 @@ struct InvalidCatOp { }; } // anonymous namespace -void SketchContainer::MakeCuts(HistogramCuts* p_cuts) { +void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) { timer_.Start(__func__); dh::safe_cuda(cudaSetDevice(device_)); p_cuts->min_vals_.Resize(num_columns_); // Sync between workers. - this->AllReduce(); + this->AllReduce(is_column_split); // Prune to final number of bins. 
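The ``test_training_continuation.py`` hunks above only swap the legacy ``.model`` file name for ``.json``; the continuation workflow itself is unchanged. A self-contained sketch (random data, hypothetical file name):

.. code-block:: python

   import os

   import numpy as np
   import xgboost as xgb

   X, y = np.random.rand(256, 8), np.random.rand(256)
   dtrain = xgb.DMatrix(X, label=y)

   first = xgb.train({"tree_method": "hist"}, dtrain, num_boost_round=3)
   first.save_model("xgb_tc.json")  # JSON (or UBJSON) is the supported format

   # Resume from the saved file; 3 + 7 boosting rounds give 10 trees in total.
   resumed = xgb.train({"tree_method": "hist"}, dtrain, num_boost_round=7,
                       xgb_model="xgb_tc.json")
   assert len(resumed.get_dump()) == 10
   os.remove("xgb_tc.json")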
this->Prune(num_bins_ + 1); diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh index 7ebd4ff51..fedbdbd82 100644 --- a/src/common/quantile.cuh +++ b/src/common/quantile.cuh @@ -154,9 +154,9 @@ class SketchContainer { Span that); /* \brief Merge quantiles from other GPU workers. */ - void AllReduce(); + void AllReduce(bool is_column_split); /* \brief Create the final histogram cut values. */ - void MakeCuts(HistogramCuts* cuts); + void MakeCuts(HistogramCuts* cuts, bool is_column_split); Span Data() const { return {this->Current().data().get(), this->Current().size()}; diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index a760ec9ab..1e74cb23c 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -106,7 +106,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, sketch_containers.clear(); sketch_containers.shrink_to_fit(); - final_sketch.MakeCuts(&cuts); + final_sketch.MakeCuts(&cuts, this->info_.IsColumnSplit()); } else { GetCutsFromRef(ctx, ref, Info().num_col_, p, &cuts); } diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index 20fd1043d..127cd95d4 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -351,7 +351,7 @@ auto MakeUnweightedCutsForTest(Adapter adapter, int32_t num_bins, float missing, SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(), 0); MetaInfo info; AdapterDeviceSketch(adapter.Value(), num_bins, info, missing, &sketch_container, batch_size); - sketch_container.MakeCuts(&batched_cuts); + sketch_container.MakeCuts(&batched_cuts, info.IsColumnSplit()); return batched_cuts; } @@ -419,7 +419,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowMemory) { AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits::quiet_NaN(), &sketch_container); HistogramCuts cuts; - sketch_container.MakeCuts(&cuts); + sketch_container.MakeCuts(&cuts, info.IsColumnSplit()); size_t bytes_required = detail::RequiredMemory( num_rows, num_columns, num_rows * num_columns, num_bins, false); EXPECT_LE(dh::GlobalMemoryLogger().PeakMemory(), bytes_required * 1.05); @@ -449,7 +449,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) { &sketch_container); HistogramCuts cuts; - sketch_container.MakeCuts(&cuts); + sketch_container.MakeCuts(&cuts, info.IsColumnSplit()); ConsoleLogger::Configure({{"verbosity", "0"}}); size_t bytes_required = detail::RequiredMemory( num_rows, num_columns, num_rows * num_columns, num_bins, true); @@ -482,7 +482,7 @@ void TestCategoricalSketchAdapter(size_t n, size_t num_categories, AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits::quiet_NaN(), &container); HistogramCuts cuts; - container.MakeCuts(&cuts); + container.MakeCuts(&cuts, info.IsColumnSplit()); thrust::sort(x.begin(), x.end()); auto n_uniques = thrust::unique(x.begin(), x.end()) - x.begin(); @@ -710,7 +710,7 @@ void TestAdapterSketchFromWeights(bool with_group) { &sketch_container); common::HistogramCuts cuts; - sketch_container.MakeCuts(&cuts); + sketch_container.MakeCuts(&cuts, info.IsColumnSplit()); auto dmat = GetDMatrixFromData(storage.HostVector(), kRows, kCols); if (with_group) { @@ -751,7 +751,7 @@ void TestAdapterSketchFromWeights(bool with_group) { SketchContainer sketch_container(ft, kBins, kCols, kRows, 0); AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits::quiet_NaN(), &sketch_container); - sketch_container.MakeCuts(&weighted); + 
sketch_container.MakeCuts(&weighted, info.IsColumnSplit()); ValidateCuts(weighted, dmat.get(), kBins); } } diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index 935d88ab6..d2dc802a9 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -388,7 +388,7 @@ void TestAllReduceBasic(int32_t n_gpus) { AdapterDeviceSketch(adapter.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &sketch_distributed); - sketch_distributed.AllReduce(); + sketch_distributed.AllReduce(false); sketch_distributed.Unique(); ASSERT_EQ(sketch_distributed.ColumnsPtr().size(), @@ -425,6 +425,58 @@ TEST(GPUQuantile, MGPUAllReduceBasic) { RunWithInMemoryCommunicator(n_gpus, TestAllReduceBasic, n_gpus); } +namespace { +void TestColumnSplitBasic() { + auto const world = collective::GetWorldSize(); + auto const rank = collective::GetRank(); + std::size_t constexpr kRows = 1000, kCols = 100, kBins = 64; + + auto m = std::unique_ptr{[=]() { + auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); + return dmat->SliceCol(world, rank); + }()}; + + // Generate cuts for distributed environment. + auto const device = rank; + HistogramCuts distributed_cuts = common::DeviceSketch(device, m.get(), kBins); + + // Generate cuts for single node environment + collective::Finalize(); + CHECK_EQ(collective::GetWorldSize(), 1); + HistogramCuts single_node_cuts = common::DeviceSketch(device, m.get(), kBins); + + auto const& sptrs = single_node_cuts.Ptrs(); + auto const& dptrs = distributed_cuts.Ptrs(); + auto const& svals = single_node_cuts.Values(); + auto const& dvals = distributed_cuts.Values(); + auto const& smins = single_node_cuts.MinValues(); + auto const& dmins = distributed_cuts.MinValues(); + + EXPECT_EQ(sptrs.size(), dptrs.size()); + for (size_t i = 0; i < sptrs.size(); ++i) { + EXPECT_EQ(sptrs[i], dptrs[i]) << "rank: " << rank << ", i: " << i; + } + + EXPECT_EQ(svals.size(), dvals.size()); + for (size_t i = 0; i < svals.size(); ++i) { + EXPECT_NEAR(svals[i], dvals[i], 2e-2f) << "rank: " << rank << ", i: " << i; + } + + EXPECT_EQ(smins.size(), dmins.size()); + for (size_t i = 0; i < smins.size(); ++i) { + EXPECT_FLOAT_EQ(smins[i], dmins[i]) << "rank: " << rank << ", i: " << i; + } +} +} // anonymous namespace + +TEST(GPUQuantile, MGPUColumnSplitBasic) { + auto const n_gpus = AllVisibleGPUs(); + if (n_gpus <= 1) { + GTEST_SKIP() << "Skipping MGPUColumnSplitBasic test with # GPUs = " << n_gpus; + } + RunWithInMemoryCommunicator(n_gpus, TestColumnSplitBasic); +} + namespace { void TestSameOnAllWorkers(std::int32_t n_gpus) { auto world = collective::GetWorldSize(); @@ -445,7 +497,7 @@ void TestSameOnAllWorkers(std::int32_t n_gpus) { AdapterDeviceSketch(adapter.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &sketch_distributed); - sketch_distributed.AllReduce(); + sketch_distributed.AllReduce(false); sketch_distributed.Unique(); TestQuantileElemRank(device, sketch_distributed.Data(), sketch_distributed.ColumnsPtr(), true); From a1367ea1f834bdf6fcaa3e9f38af0305fbed5ce8 Mon Sep 17 00:00:00 2001 From: jinmfeng001 <102719116+jinmfeng001@users.noreply.github.com> Date: Wed, 12 Jul 2023 15:18:46 +0800 Subject: [PATCH 029/136] Set feature_names and feature_types in jvm-packages (#9364) * 1. Add parameters to set feature names and feature types 2. Save feature names and feature types to native json model * Change serialization and deserialization format to ubj. 
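The commit above threads ``feature_names``/``feature_types`` from the JVM packages into the native model, so they land under ``learner`` in the JSON document. The Python package already exposes the same fields, which makes the expected output easy to inspect (random data, hypothetical file name; ``q`` marks a numeric feature, ``c`` a categorical one):

.. code-block:: python

   import json

   import numpy as np
   import xgboost as xgb

   X, y = np.random.rand(128, 3), np.random.rand(128)
   dtrain = xgb.DMatrix(
       X, label=y,
       feature_names=["f_a", "f_b", "f_c"],
       feature_types=["q", "q", "q"],
   )
   booster = xgb.train({"tree_method": "hist"}, dtrain, num_boost_round=1)
   booster.save_model("named.json")

   with open("named.json") as fd:
       model = json.load(fd)
   print(model["learner"]["feature_names"])  # ["f_a", "f_b", "f_c"]
   print(model["learner"]["feature_types"])  # ["q", "q", "q"]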
--- .../dmlc/xgboost4j/scala/spark/XGBoost.scala | 35 +++++++++- .../scala/spark/XGBoostClassifier.scala | 6 ++ .../scala/spark/XGBoostRegressor.scala | 6 ++ .../scala/spark/params/GeneralParams.scala | 15 +++++ .../scala/spark/XGBoostClassifierSuite.scala | 24 +++++++ .../java/ml/dmlc/xgboost4j/java/Booster.java | 49 +++++++++++++- .../java/ml/dmlc/xgboost4j/java/XGBoost.java | 2 + .../ml/dmlc/xgboost4j/java/XGBoostJNI.java | 4 ++ .../ml/dmlc/xgboost4j/scala/DMatrix.scala | 40 ++++++++++++ .../xgboost4j/src/native/xgboost4j.cpp | 65 +++++++++++++++++++ jvm-packages/xgboost4j/src/native/xgboost4j.h | 18 +++++ .../dmlc/xgboost4j/java/BoosterImplTest.java | 39 +++++++++-- 12 files changed, 295 insertions(+), 8 deletions(-) diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala index 0aeae791a..9208449ca 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala @@ -74,7 +74,9 @@ private[scala] case class XGBoostExecutionParams( earlyStoppingParams: XGBoostExecutionEarlyStoppingParams, cacheTrainingSet: Boolean, treeMethod: Option[String], - isLocal: Boolean) { + isLocal: Boolean, + featureNames: Option[Array[String]], + featureTypes: Option[Array[String]]) { private var rawParamMap: Map[String, Any] = _ @@ -213,6 +215,13 @@ private[this] class XGBoostExecutionParamsFactory(rawParams: Map[String, Any], s val cacheTrainingSet = overridedParams.getOrElse("cache_training_set", false) .asInstanceOf[Boolean] + val featureNames = if (overridedParams.contains("feature_names")) { + Some(overridedParams("feature_names").asInstanceOf[Array[String]]) + } else None + val featureTypes = if (overridedParams.contains("feature_types")){ + Some(overridedParams("feature_types").asInstanceOf[Array[String]]) + } else None + val xgbExecParam = XGBoostExecutionParams(nWorkers, round, useExternalMemory, obj, eval, missing, allowNonZeroForMissing, trackerConf, checkpointParam, @@ -220,7 +229,10 @@ private[this] class XGBoostExecutionParamsFactory(rawParams: Map[String, Any], s xgbExecEarlyStoppingParams, cacheTrainingSet, treeMethod, - isLocal) + isLocal, + featureNames, + featureTypes + ) xgbExecParam.setRawParamMap(overridedParams) xgbExecParam } @@ -531,6 +543,16 @@ private object Watches { if (trainMargin.isDefined) trainMatrix.setBaseMargin(trainMargin.get) if (testMargin.isDefined) testMatrix.setBaseMargin(testMargin.get) + if (xgbExecutionParams.featureNames.isDefined) { + trainMatrix.setFeatureNames(xgbExecutionParams.featureNames.get) + testMatrix.setFeatureNames(xgbExecutionParams.featureNames.get) + } + + if (xgbExecutionParams.featureTypes.isDefined) { + trainMatrix.setFeatureTypes(xgbExecutionParams.featureTypes.get) + testMatrix.setFeatureTypes(xgbExecutionParams.featureTypes.get) + } + new Watches(Array(trainMatrix, testMatrix), Array("train", "test"), cacheDirName) } @@ -643,6 +665,15 @@ private object Watches { if (trainMargin.isDefined) trainMatrix.setBaseMargin(trainMargin.get) if (testMargin.isDefined) testMatrix.setBaseMargin(testMargin.get) + if (xgbExecutionParams.featureNames.isDefined) { + trainMatrix.setFeatureNames(xgbExecutionParams.featureNames.get) + testMatrix.setFeatureNames(xgbExecutionParams.featureNames.get) + } + if (xgbExecutionParams.featureTypes.isDefined) { + 
trainMatrix.setFeatureTypes(xgbExecutionParams.featureTypes.get) + testMatrix.setFeatureTypes(xgbExecutionParams.featureTypes.get) + } + new Watches(Array(trainMatrix, testMatrix), Array("train", "test"), cacheDirName) } } diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala index 32b2c2c02..fd4633a0d 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala @@ -139,6 +139,12 @@ class XGBoostClassifier ( def setSinglePrecisionHistogram(value: Boolean): this.type = set(singlePrecisionHistogram, value) + def setFeatureNames(value: Array[String]): this.type = + set(featureNames, value) + + def setFeatureTypes(value: Array[String]): this.type = + set(featureTypes, value) + // called at the start of fit/train when 'eval_metric' is not defined private def setupDefaultEvalMetric(): String = { require(isDefined(objective), "Users must set \'objective\' via xgboostParams.") diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala index 01d001a56..99dbdc580 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala @@ -141,6 +141,12 @@ class XGBoostRegressor ( def setSinglePrecisionHistogram(value: Boolean): this.type = set(singlePrecisionHistogram, value) + def setFeatureNames(value: Array[String]): this.type = + set(featureNames, value) + + def setFeatureTypes(value: Array[String]): this.type = + set(featureTypes, value) + // called at the start of fit/train when 'eval_metric' is not defined private def setupDefaultEvalMetric(): String = { require(isDefined(objective), "Users must set \'objective\' via xgboostParams.") diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/GeneralParams.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/GeneralParams.scala index fc8954530..3f387de9b 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/GeneralParams.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/GeneralParams.scala @@ -177,6 +177,21 @@ private[spark] trait GeneralParams extends Params { final def getSeed: Long = $(seed) + /** Feature's name, it will be set to DMatrix and Booster, and in the final native json model. + * In native code, the parameter name is feature_name. + * */ + final val featureNames = new StringArrayParam(this, "feature_names", + "an array of feature names") + + final def getFeatureNames: Array[String] = $(featureNames) + + /** Feature types, q is numeric and c is categorical. 
+ * In native code, the parameter name is feature_type + * */ + final val featureTypes = new StringArrayParam(this, "feature_types", + "an array of feature types") + + final def getFeatureTypes: Array[String] = $(featureTypes) } trait HasLeafPredictionCol extends Params { diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala index 0031be9c7..1290465ea 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala @@ -27,6 +27,8 @@ import org.apache.commons.io.IOUtils import org.apache.spark.Partitioner import org.apache.spark.ml.feature.VectorAssembler +import org.json4s.{DefaultFormats, Formats} +import org.json4s.jackson.parseJson class XGBoostClassifierSuite extends AnyFunSuite with PerTest with TmpFolderPerSuite { @@ -453,4 +455,26 @@ class XGBoostClassifierSuite extends AnyFunSuite with PerTest with TmpFolderPerS assert(!compareTwoFiles(new File(modelJsonPath, "data/XGBoostClassificationModel").getPath, nativeUbjModelPath)) } + + test("native json model file should store feature_name and feature_type") { + val featureNames = (1 to 33).map(idx => s"feature_${idx}").toArray + val featureTypes = (1 to 33).map(idx => "q").toArray + val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1", + "objective" -> "multi:softprob", "num_class" -> "6", "num_round" -> 5, + "num_workers" -> numWorkers, "tree_method" -> treeMethod + ) + val trainingDF = buildDataFrame(MultiClassification.train) + val xgb = new XGBoostClassifier(paramMap) + .setFeatureNames(featureNames) + .setFeatureTypes(featureTypes) + val model = xgb.fit(trainingDF) + val modelStr = new String(model._booster.toByteArray("json")) + System.out.println(modelStr) + val jsonModel = parseJson(modelStr) + implicit val formats: Formats = DefaultFormats + val featureNamesInModel = (jsonModel \ "learner" \ "feature_names").extract[List[String]] + val featureTypesInModel = (jsonModel \ "learner" \ "feature_types").extract[List[String]] + assert(featureNamesInModel.length == 33) + assert(featureTypesInModel.length == 33) + } } diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java index ed1a3f5c9..23b8b1a80 100644 --- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java +++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java @@ -162,6 +162,51 @@ public class Booster implements Serializable, KryoSerializable { } } + /** + * Get feature names from the Booster. + * @return + * @throws XGBoostError + */ + public final String[] getFeatureNames() throws XGBoostError { + int numFeature = (int) getNumFeature(); + String[] out = new String[numFeature]; + XGBoostJNI.checkCall(XGBoostJNI.XGBoosterGetStrFeatureInfo(handle, "feature_name", out)); + return out; + } + + /** + * Set feature names to the Booster. + * + * @param featureNames + * @throws XGBoostError + */ + public void setFeatureNames(String[] featureNames) throws XGBoostError { + XGBoostJNI.checkCall(XGBoostJNI.XGBoosterSetStrFeatureInfo( + handle, "feature_name", featureNames)); + } + + /** + * Get feature types from the Booster. 
+ * @return + * @throws XGBoostError + */ + public final String[] getFeatureTypes() throws XGBoostError { + int numFeature = (int) getNumFeature(); + String[] out = new String[numFeature]; + XGBoostJNI.checkCall(XGBoostJNI.XGBoosterGetStrFeatureInfo(handle, "feature_type", out)); + return out; + } + + /** + * Set feature types to the Booster. + * @param featureTypes + * @throws XGBoostError + */ + public void setFeatureTypes(String[] featureTypes) throws XGBoostError { + XGBoostJNI.checkCall(XGBoostJNI.XGBoosterSetStrFeatureInfo( + handle, "feature_type", featureTypes)); + } + /** * Update the booster for one iteration. * @@ -744,7 +789,7 @@ public class Booster implements Serializable, KryoSerializable { private void writeObject(java.io.ObjectOutputStream out) throws IOException { try { out.writeInt(version); - out.writeObject(this.toByteArray()); + out.writeObject(this.toByteArray("ubj")); } catch (XGBoostError ex) { ex.printStackTrace(); logger.error(ex.getMessage()); @@ -780,7 +825,7 @@ public class Booster implements Serializable, KryoSerializable { @Override public void write(Kryo kryo, Output output) { try { - byte[] serObj = this.toByteArray(); + byte[] serObj = this.toByteArray("ubj"); int serObjSize = serObj.length; output.writeInt(serObjSize); output.writeInt(version); diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoost.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoost.java index 75e18957f..d765a3cab 100644 --- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoost.java +++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoost.java @@ -198,6 +198,8 @@ public class XGBoost { if (booster == null) { // Start training on a new booster booster = new Booster(params, allMats); + booster.setFeatureNames(dtrain.getFeatureNames()); + booster.setFeatureTypes(dtrain.getFeatureTypes()); booster.loadRabitCheckpoint(); } else { // Start training on an existing booster diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java index 3b8db941d..abe584f05 100644 --- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java +++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java @@ -164,4 +164,8 @@ class XGBoostJNI { public final static native int XGDMatrixCreateFromArrayInterfaceColumns( String featureJson, float missing, int nthread, long[] out); + public final static native int XGBoosterSetStrFeatureInfo(long handle, String field, String[] features); + + public final static native int XGBoosterGetStrFeatureInfo(long handle, String field, String[] out); + } diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/DMatrix.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/DMatrix.scala index 9269f3fde..714adf726 100644 --- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/DMatrix.scala +++ b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/DMatrix.scala @@ -205,6 +205,26 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) { jDMatrix.setBaseMargin(column) } + /** + * set feature names + * @param values feature names + * @throws ml.dmlc.xgboost4j.java.XGBoostError + */ + @throws(classOf[XGBoostError]) + def setFeatureNames(values: Array[String]): Unit = { + jDMatrix.setFeatureNames(values) + } + + /** + * set feature types + * @param values feature types + * @throws 
ml.dmlc.xgboost4j.java.XGBoostError + */ + @throws(classOf[XGBoostError]) + def setFeatureTypes(values: Array[String]): Unit = { + jDMatrix.setFeatureTypes(values) + } + /** * Get group sizes of DMatrix (used for ranking) */ @@ -243,6 +263,26 @@ class DMatrix private[scala](private[scala] val jDMatrix: JDMatrix) { jDMatrix.getBaseMargin } + /** + * get feature names + * @throws ml.dmlc.xgboost4j.java.XGBoostError + * @return + */ + @throws(classOf[XGBoostError]) + def getFeatureNames: Array[String] = { + jDMatrix.getFeatureNames + } + + /** + * get feature types + * @throws ml.dmlc.xgboost4j.java.XGBoostError + * @return + */ + @throws(classOf[XGBoostError]) + def getFeatureTypes: Array[String] = { + jDMatrix.getFeatureTypes + } + /** * Slice the DMatrix and return a new DMatrix that only contains `rowIndex`. * diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp index 141ec51bc..a61a68dbc 100644 --- a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp +++ b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp @@ -1148,3 +1148,68 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixGetStrFea if (field) jenv->ReleaseStringUTFChars(jfield, field); return ret; } + +/* + * Class: ml_dmlc_xgboost4j_java_XGBoostJNI + * Method: XGBoosterSetStrFeatureInfo + * Signature: (JLjava/lang/String;[Ljava/lang/String;])I + */ +JNIEXPORT jint JNICALL +Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterSetStrFeatureInfo( + JNIEnv *jenv, jclass jclz, jlong jhandle, jstring jfield, + jobjectArray jfeatures) { + BoosterHandle handle = (BoosterHandle)jhandle; + + const char *field = jenv->GetStringUTFChars(jfield, 0); + + bst_ulong feature_num = (bst_ulong)jenv->GetArrayLength(jfeatures); + + std::vector features; + std::vector features_char; + + for (bst_ulong i = 0; i < feature_num; ++i) { + jstring jfeature = (jstring)jenv->GetObjectArrayElement(jfeatures, i); + const char *s = jenv->GetStringUTFChars(jfeature, 0); + features.push_back(std::string(s, jenv->GetStringLength(jfeature))); + if (s != nullptr) jenv->ReleaseStringUTFChars(jfeature, s); + } + + for (size_t i = 0; i < features.size(); ++i) { + features_char.push_back(features[i].c_str()); + } + + int ret = XGBoosterSetStrFeatureInfo( + handle, field, dmlc::BeginPtr(features_char), feature_num); + JVM_CHECK_CALL(ret); + return ret; +} + +/* + * Class: ml_dmlc_xgboost4j_java_XGBoostJNI + * Method: XGBoosterSetGtrFeatureInfo + * Signature: (JLjava/lang/String;[Ljava/lang/String;])I + */ +JNIEXPORT jint JNICALL +Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterGetStrFeatureInfo( + JNIEnv *jenv, jclass jclz, jlong jhandle, jstring jfield, + jobjectArray jout) { + BoosterHandle handle = (BoosterHandle)jhandle; + + const char *field = jenv->GetStringUTFChars(jfield, 0); + + bst_ulong feature_num = (bst_ulong)jenv->GetArrayLength(jout); + + const char **features; + std::vector features_char; + + int ret = XGBoosterGetStrFeatureInfo(handle, field, &feature_num, + (const char ***)&features); + JVM_CHECK_CALL(ret); + + for (bst_ulong i = 0; i < feature_num; i++) { + jstring jfeature = jenv->NewStringUTF(features[i]); + jenv->SetObjectArrayElement(jout, i, jfeature); + } + + return ret; +} diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j.h b/jvm-packages/xgboost4j/src/native/xgboost4j.h index 75dcd4b77..11a2f86ff 100644 --- a/jvm-packages/xgboost4j/src/native/xgboost4j.h +++ b/jvm-packages/xgboost4j/src/native/xgboost4j.h @@ -383,6 +383,24 @@ JNIEXPORT jint JNICALL 
Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGQuantileDMatrixC JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFromArrayInterfaceColumns (JNIEnv *, jclass, jstring, jfloat, jint, jlongArray); +/* + * Class: ml_dmlc_xgboost4j_java_XGBoostJNI + * Method: XGBoosterSetStrFeatureInfo + * Signature: (JLjava/lang/String;[Ljava/lang/String;])I + */ +JNIEXPORT jint JNICALL +Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterSetStrFeatureInfo + (JNIEnv *, jclass, jlong, jstring, jobjectArray); + +/* + * Class: ml_dmlc_xgboost4j_java_XGBoostJNI + * Method: XGBoosterGetStrFeatureInfo + * Signature: (JLjava/lang/String;[Ljava/lang/String;])I + */ +JNIEXPORT jint JNICALL +Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterGetStrFeatureInfo + (JNIEnv *, jclass, jlong, jstring, jobjectArray); + #ifdef __cplusplus } #endif diff --git a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java index 20a243f5b..d53c003a4 100644 --- a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java +++ b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java @@ -16,10 +16,7 @@ package ml.dmlc.xgboost4j.java; import java.io.*; -import java.util.Arrays; -import java.util.HashMap; -import java.util.LinkedHashMap; -import java.util.Map; +import java.util.*; import junit.framework.TestCase; import org.junit.Test; @@ -122,6 +119,40 @@ public class BoosterImplTest { TestCase.assertTrue(eval.eval(predicts2, testMat) < 0.1f); } + @Test + public void saveLoadModelWithFeaturesWithPath() throws XGBoostError, IOException { + DMatrix trainMat = new DMatrix(this.train_uri); + DMatrix testMat = new DMatrix(this.test_uri); + IEvaluation eval = new EvalError(); + + String[] featureNames = new String[126]; + String[] featureTypes = new String[126]; + for(int i = 0; i < 126; i++) { + featureNames[i] = "test_feature_name_" + i; + featureTypes[i] = "q"; + } + trainMat.setFeatureNames(featureNames); + testMat.setFeatureNames(featureNames); + trainMat.setFeatureTypes(featureTypes); + testMat.setFeatureTypes(featureTypes); + + Booster booster = trainBooster(trainMat, testMat); + // save and load, only json format save and load feature_name and feature_type + File temp = File.createTempFile("temp", ".json"); + temp.deleteOnExit(); + booster.saveModel(temp.getAbsolutePath()); + + String modelString = new String(booster.toByteArray("json")); + System.out.println(modelString); + + Booster bst2 = XGBoost.loadModel(temp.getAbsolutePath()); + assert (Arrays.equals(bst2.toByteArray("ubj"), booster.toByteArray("ubj"))); + assert (Arrays.equals(bst2.toByteArray("json"), booster.toByteArray("json"))); + assert (Arrays.equals(bst2.toByteArray("deprecated"), booster.toByteArray("deprecated"))); + float[][] predicts2 = bst2.predict(testMat, true, 0); + TestCase.assertTrue(eval.eval(predicts2, testMat) < 0.1f); + } + @Test public void saveLoadModelWithStream() throws XGBoostError, IOException { DMatrix trainMat = new DMatrix(this.train_uri); From 2d0cd2817e868e1975b1cebf183b88226a22c01d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A1ssia=20Sampaio?= <4005687+cassiasamp@users.noreply.github.com> Date: Thu, 13 Jul 2023 00:00:24 -0300 Subject: [PATCH 030/136] [doc] Fux learning_to_rank.rst (#9381) just adding one missing bracket --- doc/tutorials/learning_to_rank.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/tutorials/learning_to_rank.rst 
b/doc/tutorials/learning_to_rank.rst index e0c27b87b..965a623c3 100644 --- a/doc/tutorials/learning_to_rank.rst +++ b/doc/tutorials/learning_to_rank.rst @@ -70,7 +70,7 @@ Please note that, as of writing, there's no learning-to-rank interface in scikit .. code-block:: python - df = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])) + df = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])]) df["qid"] = qid ranker.fit(df, y) # No need to pass qid as a separate argument From 04aff3af8e3742bea33cfb59038926849443cb98 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 13 Jul 2023 19:30:25 +0800 Subject: [PATCH 031/136] Define the new `device` parameter. (#9362) --- CITATION | 1 - doc/gpu/index.rst | 9 +- doc/install.rst | 8 +- doc/parameter.rst | 39 ++-- doc/treemethod.rst | 60 +++-- doc/tutorials/dask.rst | 2 +- doc/tutorials/saving_model.rst | 2 +- include/xgboost/base.h | 2 +- include/xgboost/context.h | 72 ++++-- include/xgboost/json.h | 6 +- include/xgboost/learner.h | 17 +- include/xgboost/predictor.h | 5 +- .../scala/rapids/spark/GpuPreXGBoost.scala | 2 +- .../dmlc/xgboost4j/scala/spark/XGBoost.scala | 2 +- python-package/xgboost/core.py | 22 +- python-package/xgboost/sklearn.py | 6 +- python-package/xgboost/spark/core.py | 13 +- python-package/xgboost/spark/data.py | 14 +- python-package/xgboost/spark/estimator.py | 13 +- python-package/xgboost/testing/__init__.py | 18 -- src/c_api/c_api.cu | 5 +- src/common/error_msg.cc | 26 ++- src/common/error_msg.h | 4 + src/context.cc | 214 +++++++++++++++--- src/data/iterative_dmatrix.cc | 5 +- src/data/proxy_dmatrix.cc | 1 + src/data/proxy_dmatrix.cu | 21 +- src/data/simple_dmatrix.cu | 2 +- src/gbm/gbtree.cc | 32 ++- src/learner.cc | 19 +- tests/ci_build/lint_python.py | 1 + tests/cpp/common/test_algorithm.cu | 6 +- tests/cpp/common/test_hist_util.cu | 2 +- tests/cpp/common/test_linalg.cu | 4 +- tests/cpp/common/test_ranking_utils.cu | 12 +- tests/cpp/common/test_stats.cc | 10 +- tests/cpp/common/test_stats.cu | 17 +- tests/cpp/data/test_gradient_index.cc | 3 +- tests/cpp/gbm/test_gbtree.cc | 43 +++- tests/cpp/gbm/test_gbtree.cu | 22 +- tests/cpp/helpers.h | 12 - tests/cpp/metric/test_multiclass_metric.h | 2 - tests/cpp/objective/test_lambdarank_obj.cu | 18 +- tests/cpp/objective/test_regression_obj.cc | 4 +- .../cpp/plugin/test_regression_obj_oneapi.cc | 2 +- tests/cpp/predictor/test_cpu_predictor.cc | 5 +- tests/cpp/predictor/test_gpu_predictor.cu | 49 ++-- tests/cpp/predictor/test_predictor.cc | 89 +++----- tests/cpp/predictor/test_predictor.h | 5 +- tests/cpp/test_context.cc | 31 +++ tests/cpp/test_context.cu | 99 ++++++++ tests/cpp/test_learner.cc | 30 ++- tests/cpp/tree/test_node_partition.cc | 6 +- tests/cpp/tree/test_prediction_cache.cc | 4 +- tests/python-gpu/load_pickle.py | 6 +- .../test_device_quantile_dmatrix.py | 2 +- tests/python-gpu/test_gpu_basic_models.py | 16 +- tests/python-gpu/test_gpu_eval_metrics.py | 12 +- tests/python-gpu/test_gpu_pickling.py | 10 +- tests/python-gpu/test_gpu_prediction.py | 127 +++++++---- tests/python/test_predict.py | 8 +- .../test_with_spark/test_data.py | 2 +- .../test_with_spark/test_spark_local.py | 3 + 63 files changed, 827 insertions(+), 477 deletions(-) create mode 100644 tests/cpp/test_context.cc create mode 100644 tests/cpp/test_context.cu diff --git a/CITATION b/CITATION index 189062510..b2acce7c1 100644 --- a/CITATION +++ b/CITATION @@ -15,4 +15,3 @@ address = {New York, NY, USA}, keywords = {large-scale machine learning}, } - diff --git a/doc/gpu/index.rst 
b/doc/gpu/index.rst index 97c9799fd..3cee0cdf5 100644 --- a/doc/gpu/index.rst +++ b/doc/gpu/index.rst @@ -22,7 +22,8 @@ Supported parameters GPU accelerated prediction is enabled by default for the above mentioned ``tree_method`` parameters but can be switched to CPU prediction by setting ``predictor`` to ``cpu_predictor``. This could be useful if you want to conserve GPU memory. Likewise when using CPU algorithms, GPU accelerated prediction can be enabled by setting ``predictor`` to ``gpu_predictor``. The device ordinal (which GPU to use if you have many of them) can be selected using the -``gpu_id`` parameter, which defaults to 0 (the first device reported by CUDA runtime). +``device`` parameter, which defaults to 0 when "CUDA" is specified (the first device reported by CUDA +runtime). The GPU algorithms currently work with CLI, Python, R, and JVM packages. See :doc:`/install` for details. @@ -30,13 +31,13 @@ The GPU algorithms currently work with CLI, Python, R, and JVM packages. See :do .. code-block:: python :caption: Python example - param['gpu_id'] = 0 + param["device"] = "cuda:0" param['tree_method'] = 'gpu_hist' .. code-block:: python :caption: With Scikit-Learn interface - XGBRegressor(tree_method='gpu_hist', gpu_id=0) + XGBRegressor(tree_method='gpu_hist', device="cuda") GPU-Accelerated SHAP values XGBoost makes use of `GPUTreeShap <https://github.com/rapidsai/gputreeshap>`_ as .. code-block:: python - model.set_param({"gpu_id": "0", "tree_method": "gpu_hist"}) + model.set_param({"device": "cuda:0", "tree_method": "gpu_hist"}) shap_values = model.predict(dtrain, pred_contribs=True) shap_interaction_values = model.predict(dtrain, pred_interactions=True) diff --git a/doc/install.rst b/doc/install.rst index 0e155f647..51f0d0d60 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -3,10 +3,10 @@ Installation Guide ################## XGBoost provides binary packages for some language bindings. The binary packages support -the GPU algorithm (``gpu_hist``) on machines with NVIDIA GPUs. Please note that **training -with multiple GPUs is only supported for Linux platform**. See :doc:`gpu/index`. Also we -have both stable releases and nightly builds, see below for how to install them. For -building from source, visit :doc:`this page </build>`. +the GPU algorithm (``device=cuda:0``) on machines with NVIDIA GPUs. Please note that +**training with multiple GPUs is only supported for Linux platform**. See +:doc:`gpu/index`. Also we have both stable releases and nightly builds, see below for how +to install them. For building from source, visit :doc:`this page </build>`. .. contents:: Contents diff --git a/doc/parameter.rst b/doc/parameter.rst index 22893e400..d628d161b 100644 --- a/doc/parameter.rst +++ b/doc/parameter.rst @@ -59,6 +59,18 @@ General Parameters - Feature dimension used in boosting, set to maximum dimension of the feature +* ``device`` [default= ``cpu``] + + .. versionadded:: 2.0.0 + + - Device for XGBoost to run. User can set it to one of the following values: + + + ``cpu``: Use CPU. + + ``cuda``: Use a GPU (CUDA device). + + ``cuda:<ordinal>``: ``<ordinal>`` is an integer that specifies the ordinal of the GPU (which GPU do you want to use if you have more than one device). + + ``gpu``: Default GPU device selection from the list of available and supported devices. Only ``cuda`` devices are supported currently. + + ``gpu:<ordinal>``: Default GPU device selection from the list of available and supported devices. Only ``cuda`` devices are supported currently.
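A minimal usage sketch for the ``device`` values listed above, assuming a CUDA-enabled build with at least one visible GPU; the arrays here are random stand-ins for real data:

.. code-block:: python

    import numpy as np
    import xgboost as xgb

    X, y = np.random.rand(100, 10), np.random.rand(100)
    dtrain = xgb.DMatrix(X, label=y)

    # Any of "cpu", "cuda", "cuda:0", "gpu", or "gpu:0" is accepted here.
    params = {"tree_method": "hist", "device": "cuda:0"}
    booster = xgb.train(params, dtrain, num_boost_round=10)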
+ Parameters for Tree Booster =========================== * ``eta`` [default=0.3, alias: ``learning_rate``] @@ -99,7 +111,7 @@ Parameters for Tree Booster - ``gradient_based``: the selection probability for each training instance is proportional to the *regularized absolute value* of gradients (more specifically, :math:`\sqrt{g^2+\lambda h^2}`). ``subsample`` may be set to as low as 0.1 without loss of model accuracy. Note that this - sampling method is only supported when ``tree_method`` is set to ``gpu_hist``; other tree + sampling method is only supported when ``tree_method`` is set to ``hist`` and the device is ``cuda``; other tree methods only support ``uniform`` sampling. * ``colsample_bytree``, ``colsample_bylevel``, ``colsample_bynode`` [default=1] @@ -131,26 +143,15 @@ Parameters for Tree Booster * ``tree_method`` string [default= ``auto``] - The tree construction algorithm used in XGBoost. See description in the `reference paper <http://arxiv.org/abs/1603.02754>`_ and :doc:`treemethod`. - - XGBoost supports ``approx``, ``hist`` and ``gpu_hist`` for distributed training. Experimental support for external memory is available for ``approx`` and ``gpu_hist``. - - Choices: ``auto``, ``exact``, ``approx``, ``hist``, ``gpu_hist``, this is a - combination of commonly used updaters. For other updaters like ``refresh``, set the - parameter ``updater`` directly. + - Choices: ``auto``, ``exact``, ``approx``, ``hist``, this is a combination of commonly + used updaters. For other updaters like ``refresh``, set the parameter ``updater`` + directly. - - ``auto``: Use heuristic to choose the fastest method. - - - For small dataset, exact greedy (``exact``) will be used. - - For larger dataset, approximate algorithm (``approx``) will be chosen. It's - recommended to try ``hist`` and ``gpu_hist`` for higher performance with large - dataset. - (``gpu_hist``)has support for ``external memory``. - - - Because old behavior is always use exact greedy in single machine, user will get a - message when approximate algorithm is chosen to notify this choice. + - ``auto``: Same as the ``hist`` tree method. - ``exact``: Exact greedy algorithm. Enumerates all split candidates. - ``approx``: Approximate greedy algorithm using quantile sketch and gradient histogram. - ``hist``: Faster histogram optimized approximate greedy algorithm. - - ``gpu_hist``: GPU implementation of ``hist`` algorithm. * ``scale_pos_weight`` [default=1] @@ -163,7 +164,7 @@ Parameters for Tree Booster - ``grow_colmaker``: non-distributed column-based construction of trees. - ``grow_histmaker``: distributed tree construction with row-based data splitting based on global proposal of histogram counting. - ``grow_quantile_histmaker``: Grow tree using quantized histogram. - - ``grow_gpu_hist``: Grow tree with GPU. + - ``grow_gpu_hist``: Grow tree with GPU. Same as setting tree method to ``hist`` and using ``device=cuda``. - ``sync``: synchronizes trees in all distributed nodes. - ``refresh``: refreshes tree's statistics and/or leaf values based on the current data. Note that no random subsampling of data rows is performed. - ``prune``: prunes the splits where loss < min_split_loss (or gamma) and nodes that have depth greater than ``max_depth``. @@ -183,7 +184,7 @@ Parameters for Tree Booster * ``grow_policy`` [default= ``depthwise``] - Controls a way new nodes are added to the tree. - - Currently supported only if ``tree_method`` is set to ``hist``, ``approx`` or ``gpu_hist``. + - Currently supported only if ``tree_method`` is set to ``hist`` or ``approx``.
- Choices: ``depthwise``, ``lossguide`` - ``depthwise``: split at nodes closest to the root. @@ -195,7 +196,7 @@ Parameters for Tree Booster * ``max_bin``, [default=256] - - Only used if ``tree_method`` is set to ``hist``, ``approx`` or ``gpu_hist``. + - Only used if ``tree_method`` is set to ``hist`` or ``approx``. - Maximum number of discrete bins to bucket continuous features. - Increasing this number improves the optimality of splits at the cost of higher computation time. diff --git a/doc/treemethod.rst b/doc/treemethod.rst index 254eafb28..8ecddc066 100644 --- a/doc/treemethod.rst +++ b/doc/treemethod.rst @@ -3,14 +3,14 @@ Tree Methods ############ For training boosted tree models, there are 2 parameters used for choosing algorithms, -namely ``updater`` and ``tree_method``. XGBoost has 4 builtin tree methods, namely -``exact``, ``approx``, ``hist`` and ``gpu_hist``. Along with these tree methods, there -are also some free standing updaters including ``refresh``, -``prune`` and ``sync``. The parameter ``updater`` is more primitive than ``tree_method`` -as the latter is just a pre-configuration of the former. The difference is mostly due to -historical reasons that each updater requires some specific configurations and might has -missing features. As we are moving forward, the gap between them is becoming more and -more irrelevant. We will collectively document them under tree methods. +namely ``updater`` and ``tree_method``. XGBoost has 3 builtin tree methods, namely +``exact``, ``approx`` and ``hist``. Along with these tree methods, there are also some +free standing updaters including ``refresh``, ``prune`` and ``sync``. The parameter +``updater`` is more primitive than ``tree_method`` as the latter is just a +pre-configuration of the former. The difference is mostly due to historical reasons that +each updater requires some specific configurations and might have missing features. As we +are moving forward, the gap between them is becoming more and more irrelevant. We will +collectively document them under tree methods. ************** Exact Solution ************** Exact means XGBoost considers all candidates from data for tree splitting, but underlying the objective is still interpreted as a Taylor expansion. -1. ``exact``: Vanilla gradient boosting tree algorithm described in `reference paper - <http://arxiv.org/abs/1603.02754>`_. During each split finding procedure, it iterates - over all entries of input data. It's more accurate (among other greedy methods) but - slow in computation performance. Also it doesn't support distributed training as - XGBoost employs row spliting data distribution while ``exact`` tree method works on a - sorted column format. This tree method can be used with parameter ``tree_method`` set - to ``exact``. +1. ``exact``: The vanilla gradient boosting tree algorithm described in `reference paper + <http://arxiv.org/abs/1603.02754>`_. During split-finding, it iterates over all + entries of input data. It's more accurate (among other greedy methods) but + computationally slower compared to other tree methods. Furthermore, its feature + set is limited. Features like distributed training and external memory that require + approximated quantiles are not supported. This tree method can be used with the + parameter ``tree_method`` set to ``exact``. ********************** Approximated Solutions ********************** As ``exact`` tree method is slow in performance and not scalable, we often employ -approximated training algorithms.
These algorithms build a gradient histogram for each -node and iterate through the histogram instead of real dataset. Here we introduce the -implementations in XGBoost below. +As ``exact`` tree method is slow in computation performance and difficult to scale, we +often employ approximated training algorithms. These algorithms build a gradient +histogram for each node and iterate through the histogram instead of real dataset. Here +we introduce the implementations in XGBoost. 1. ``approx`` tree method: An approximation tree method described in `reference paper <http://arxiv.org/abs/1603.02754>`_. It runs sketching before building each tree @@ -48,22 +48,18 @@ implementations in XGBoost below. this global sketch. This is the fastest algorithm as it runs sketching only once. The algorithm can be accessed by setting ``tree_method`` to ``hist``. -3. ``gpu_hist`` tree method: The ``gpu_hist`` tree method is a GPU implementation of - ``hist``, with additional support for gradient based sampling. The algorithm can be - accessed by setting ``tree_method`` to ``gpu_hist``. ************ Implications ************ Some objectives like ``reg:squarederror`` have constant hessian. In this case, ``hist`` -or ``gpu_hist`` should be preferred as weighted sketching doesn't make sense with constant +Some objectives like ``reg:squarederror`` have constant hessian. In this case, the +``hist`` should be preferred as weighted sketching doesn't make sense with constant weights. When using non-constant hessian objectives, sometimes ``approx`` yields better -accuracy, but with slower computation performance. Most of the time using ``(gpu)_hist`` -with higher ``max_bin`` can achieve similar or even superior accuracy while maintaining -good performance. However, as xgboost is largely driven by community effort, the actual -implementations have some differences than pure math description. Result might have -slight differences than expectation, which we are currently trying to overcome. +accuracy, but with slower computation performance. Most of the time using ``hist`` with +higher ``max_bin`` can achieve similar or even superior accuracy while maintaining good +performance. However, as xgboost is largely driven by community effort, the actual +implementations have some differences from the pure math description. Result might be +slightly different from expectation, which we are currently trying to overcome. ************** Other Updaters ************** @@ -106,8 +102,8 @@ solely for the interest of documentation. histogram creation step and uses sketching values directly during split evaluation. It was never tested and contained some unknown bugs, we decided to remove it and focus our resources on more promising algorithms instead. For accuracy, most of the time - ``approx``, ``hist`` and ``gpu_hist`` are enough with some parameters tuning, so - removing them don't have any real practical impact. + ``approx`` and ``hist`` are enough with some parameter tuning, so removing them doesn't + have any real practical impact. 3. ``grow_local_histmaker`` updater: An approximation tree method described in `reference paper <http://arxiv.org/abs/1603.02754>`_. This updater was rarely used in practice so diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst index 8cb2e6ee2..7fde35b0e 100644 --- a/doc/tutorials/dask.rst +++ b/doc/tutorials/dask.rst @@ -149,7 +149,7 @@ Also for inplace prediction: .. code-block:: python # where X is a dask DataFrame or dask Array backed by cupy or cuDF.
- booster.set_param({"gpu_id": "0"}) + booster.set_param({"device": "cuda:0"}) prediction = xgb.dask.inplace_predict(client, booster, X) When input is ``da.Array`` object, output is always ``da.Array``. However, if the input diff --git a/doc/tutorials/saving_model.rst b/doc/tutorials/saving_model.rst index e536f3fcc..5d9ba1d55 100644 --- a/doc/tutorials/saving_model.rst +++ b/doc/tutorials/saving_model.rst @@ -163,7 +163,7 @@ Will print out something similar to (not actual output as it's too long for demo { "Learner": { "generic_parameter": { - "gpu_id": "0", + "device": "cuda:0", "gpu_page_size": "0", "n_jobs": "0", "random_state": "0", diff --git a/include/xgboost/base.h b/include/xgboost/base.h index 6ccd168f3..9a61151f4 100644 --- a/include/xgboost/base.h +++ b/include/xgboost/base.h @@ -119,7 +119,7 @@ using bst_group_t = std::uint32_t; // NOLINT */ using bst_target_t = std::uint32_t; // NOLINT /** - * brief Type for indexing boosted layers. + * @brief Type for indexing boosted layers. */ using bst_layer_t = std::int32_t; // NOLINT /** diff --git a/include/xgboost/context.h b/include/xgboost/context.h index de7648079..262733b22 100644 --- a/include/xgboost/context.h +++ b/include/xgboost/context.h @@ -12,12 +12,18 @@ #include // for int16_t, int32_t, int64_t #include // for shared_ptr #include // for string, to_string -#include // for invoke_result_t, is_same_v +#include // for invoke_result_t, is_same_v, underlying_type_t namespace xgboost { struct CUDAContext; +// symbolic names +struct DeviceSym { + static auto constexpr CPU() { return "cpu"; } + static auto constexpr CUDA() { return "cuda"; } +}; + /** * @brief A type for device ordinal. The type is packed into 32-bit for efficient use in * viewing types like `linalg::TensorView`. @@ -59,9 +65,9 @@ struct DeviceOrd { [[nodiscard]] std::string Name() const { switch (device) { case DeviceOrd::kCPU: - return "CPU"; + return DeviceSym::CPU(); case DeviceOrd::kCUDA: - return "CUDA:" + std::to_string(ordinal); + return DeviceSym::CUDA() + (':' + std::to_string(ordinal)); default: { LOG(FATAL) << "Unknown device."; return ""; @@ -76,26 +82,39 @@ static_assert(sizeof(DeviceOrd) == sizeof(std::int32_t)); * @brief Runtime context for XGBoost. Contains information like threads and device. */ struct Context : public XGBoostParameter { + private: + std::string device{DeviceSym::CPU()}; // NOLINT + // The device object for the current context. We are in the middle of replacing the + // `gpu_id` with this device field. + DeviceOrd device_{DeviceOrd::CPU()}; + public: // Constant representing the device ID of CPU. - static std::int32_t constexpr kCpuId = -1; + static bst_d_ordinal_t constexpr kCpuId = -1; + static bst_d_ordinal_t constexpr InvalidOrdinal() { return -2; } static std::int64_t constexpr kDefaultSeed = 0; public: Context(); + template + Args UpdateAllowUnknown(Container const& kwargs) { + auto args = XGBoostParameter::UpdateAllowUnknown(kwargs); + this->SetDeviceOrdinal(kwargs); + return args; + } + + std::int32_t gpu_id{kCpuId}; + // The number of threads to use if OpenMP is enabled. If equals 0, use the system default. + std::int32_t nthread{0}; // NOLINT // stored random seed std::int64_t seed{kDefaultSeed}; // whether seed the PRNG each iteration bool seed_per_iteration{false}; - // number of threads to use if OpenMP is enabled - // if equals 0, use system default - std::int32_t nthread{0}; - // primary device, -1 means no gpu. 
- std::int32_t gpu_id{kCpuId}; // fail when gpu_id is invalid bool fail_on_invalid_gpu_id{false}; bool validate_parameters{false}; + /** * @brief Configure the parameter `gpu_id'. * @@ -111,21 +130,19 @@ struct Context : public XGBoostParameter { /** * @brief Is XGBoost running on CPU? */ - [[nodiscard]] bool IsCPU() const { return gpu_id == kCpuId; } + [[nodiscard]] bool IsCPU() const { return Device().IsCPU(); } /** * @brief Is XGBoost running on a CUDA device? */ - [[nodiscard]] bool IsCUDA() const { return !IsCPU(); } + [[nodiscard]] bool IsCUDA() const { return Device().IsCUDA(); } /** * @brief Get the current device and ordinal. */ - [[nodiscard]] DeviceOrd Device() const { - return IsCPU() ? DeviceOrd::CPU() : DeviceOrd::CUDA(static_cast(gpu_id)); - } + [[nodiscard]] DeviceOrd Device() const { return device_; } /** * @brief Get the CUDA device ordinal. -1 if XGBoost is running on CPU. */ - [[nodiscard]] bst_d_ordinal_t Ordinal() const { return this->gpu_id; } + [[nodiscard]] bst_d_ordinal_t Ordinal() const { return Device().ordinal; } /** * @brief Name of the current device. */ @@ -134,24 +151,22 @@ struct Context : public XGBoostParameter { * @brief Get a CUDA device context for allocator and stream. */ [[nodiscard]] CUDAContext const* CUDACtx() const; + /** * @brief Make a CUDA context based on the current context. * * @param ordinal The CUDA device ordinal. */ - [[nodiscard]] Context MakeCUDA(std::int32_t ordinal = 0) const { + [[nodiscard]] Context MakeCUDA(bst_d_ordinal_t ordinal = 0) const { Context ctx = *this; - CHECK_GE(ordinal, 0); - ctx.gpu_id = ordinal; - return ctx; + return ctx.SetDevice(DeviceOrd::CUDA(ordinal)); } /** * @brief Make a CPU context based on the current context. */ [[nodiscard]] Context MakeCPU() const { Context ctx = *this; - ctx.gpu_id = kCpuId; - return ctx; + return ctx.SetDevice(DeviceOrd::CPU()); } /** * @brief Call function based on the current device. @@ -167,7 +182,8 @@ struct Context : public XGBoostParameter { default: // Do not use the device name as this is likely an internal error, the name // wouldn't be valid. - LOG(FATAL) << "Unknown device type:" << static_cast(this->Device().device); + LOG(FATAL) << "Unknown device type:" + << static_cast>(this->Device().device); break; } return std::invoke_result_t(); @@ -182,11 +198,9 @@ struct Context : public XGBoostParameter { DMLC_DECLARE_FIELD(seed_per_iteration) .set_default(false) .describe("Seed PRNG determnisticly via iterator number."); + DMLC_DECLARE_FIELD(device).set_default(DeviceSym::CPU()).describe("Device ordinal."); DMLC_DECLARE_FIELD(nthread).set_default(0).describe("Number of threads to use."); DMLC_DECLARE_ALIAS(nthread, n_jobs); - - DMLC_DECLARE_FIELD(gpu_id).set_default(-1).set_lower_bound(-1).describe( - "The primary GPU device ordinal."); DMLC_DECLARE_FIELD(fail_on_invalid_gpu_id) .set_default(false) .describe("Fail with error when gpu_id is invalid."); @@ -196,6 +210,14 @@ struct Context : public XGBoostParameter { } private: + void SetDeviceOrdinal(Args const& kwargs); + Context& SetDevice(DeviceOrd d) { + this->device_ = d; + this->gpu_id = d.ordinal; // this can be removed once we move away from `gpu_id`. + this->device = d.Name(); + return *this; + } + // mutable for lazy cuda context initialization. This avoids initializing CUDA at load. // shared_ptr is used instead of unique_ptr as with unique_ptr it's difficult to define // p_impl while trying to hide CUDA code from the host compiler. 
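At the Python level, the device handling declared in this header reduces to a single string parameter; a rough sketch of the intended equivalences (``booster`` stands for any trained :py:class:`xgboost.Booster`):

.. code-block:: python

    # Equivalent spellings for the first CUDA device; "gpu" is an alias that
    # currently resolves to "cuda".
    booster.set_param({"device": "cuda:0"})
    booster.set_param({"device": "gpu:0"})

    # The pre-2.0 spelling still parses, but it is rewritten to `device`
    # internally and emits a deprecation warning.
    booster.set_param({"gpu_id": "0"})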
diff --git a/include/xgboost/json.h b/include/xgboost/json.h index 3b34c2874..cb22e120e 100644 --- a/include/xgboost/json.h +++ b/include/xgboost/json.h @@ -664,11 +664,11 @@ Object ToJson(Parameter const& param) { template Args FromJson(Json const& obj, Parameter* param) { auto const& j_param = get(obj); - std::map m; + Args args; for (auto const& kv : j_param) { - m[kv.first] = get(kv.second); + args.emplace_back(kv.first, get(kv.second)); } - return param->UpdateAllowUnknown(m); + return param->UpdateAllowUnknown(args); } } // namespace xgboost #endif // XGBOOST_JSON_H_ diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h index f2b377ac1..8adb3cb27 100644 --- a/include/xgboost/learner.h +++ b/include/xgboost/learner.h @@ -110,15 +110,10 @@ class Learner : public Model, public Configurable, public dmlc::Serializable { * \param approx_contribs whether to approximate the feature contributions for speed * \param pred_interactions whether to compute the feature pair contributions */ - virtual void Predict(std::shared_ptr data, - bool output_margin, - HostDeviceVector *out_preds, - unsigned layer_begin, - unsigned layer_end, - bool training = false, - bool pred_leaf = false, - bool pred_contribs = false, - bool approx_contribs = false, + virtual void Predict(std::shared_ptr data, bool output_margin, + HostDeviceVector* out_preds, bst_layer_t layer_begin, + bst_layer_t layer_end, bool training = false, bool pred_leaf = false, + bool pred_contribs = false, bool approx_contribs = false, bool pred_interactions = false) = 0; /*! @@ -132,8 +127,8 @@ class Learner : public Model, public Configurable, public dmlc::Serializable { * \param layer_end End of booster layer. 0 means do not limit trees. */ virtual void InplacePredict(std::shared_ptr p_m, PredictionType type, float missing, - HostDeviceVector** out_preds, uint32_t layer_begin, - uint32_t layer_end) = 0; + HostDeviceVector** out_preds, bst_layer_t layer_begin, + bst_layer_t layer_end) = 0; /*! * \brief Calculate feature score. See doc in C API for outputs. diff --git a/include/xgboost/predictor.h b/include/xgboost/predictor.h index f0d2e8e37..2c69cf648 100644 --- a/include/xgboost/predictor.h +++ b/include/xgboost/predictor.h @@ -39,9 +39,8 @@ struct PredictionCacheEntry { * * \param v Added versions. */ - void Update(std::uint32_t v) { - version += v; - } + void Update(std::uint32_t v) { version += v; } + void Reset() { version = 0; } }; /** diff --git a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala index eef10a36d..9ff42e370 100644 --- a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala +++ b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala @@ -280,7 +280,7 @@ object GpuPreXGBoost extends PreXGBoostProvider { // - gpu id // - predictor: Force to gpu predictor since native doesn't save predictor. 
val gpuId = if (!isLocal) XGBoost.getGPUAddrFromResources else 0 - booster.setParam("gpu_id", gpuId.toString) + booster.setParam("device", s"cuda:$gpuId") logger.info("GPU transform on device: " + gpuId) boosterFlag.isGpuParamsSet = true; } diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala index 9208449ca..48b31a99f 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala @@ -326,7 +326,7 @@ object XGBoost extends Serializable { getGPUAddrFromResources } logger.info("Leveraging gpu device " + gpuId + " to train") - params = params + ("gpu_id" -> gpuId) + params = params + ("device" -> s"cuda:$gpuId") } val booster = if (makeCheckpoint) { SXGBoost.trainAndSaveCheckpoint( diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 31f34256d..d6214c7a6 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -1393,13 +1393,13 @@ class _ProxyDMatrix(DMatrix): class QuantileDMatrix(DMatrix): - """A DMatrix variant that generates quantilized data directly from input for - ``hist`` and ``gpu_hist`` tree methods. This DMatrix is primarily designed to save - memory in training by avoiding intermediate storage. Set ``max_bin`` to control the - number of bins during quantisation, which should be consistent with the training - parameter ``max_bin``. When ``QuantileDMatrix`` is used for validation/test dataset, - ``ref`` should be another ``QuantileDMatrix``(or ``DMatrix``, but not recommended as - it defeats the purpose of saving memory) constructed from training dataset. See + """A DMatrix variant that generates quantilized data directly from input for the + ``hist`` tree method. This DMatrix is primarily designed to save memory in training + by avoiding intermediate storage. Set ``max_bin`` to control the number of bins + during quantisation, which should be consistent with the training parameter + ``max_bin``. When ``QuantileDMatrix`` is used for validation/test dataset, ``ref`` + should be another ``QuantileDMatrix``(or ``DMatrix``, but not recommended as it + defeats the purpose of saving memory) constructed from training dataset. See :py:obj:`xgboost.DMatrix` for documents on meta info. .. note:: @@ -2277,10 +2277,10 @@ class Booster: .. code-block:: python - booster.set_param({"gpu_id": "0", "tree_method": "gpu_hist"}) + booster.set_param({"device": "cuda:0"}) booster.inplace_predict(cupy_array) - booster.set_param({"gpu_id": "-1", "tree_method": "hist"}) + booster.set_param({"device": "cpu"}) booster.inplace_predict(numpy_array) .. versionadded:: 1.1.0 @@ -2311,8 +2311,8 @@ class Booster: Returns ------- prediction : numpy.ndarray/cupy.ndarray - The prediction result. When input data is on GPU, prediction - result is stored in a cupy array. + The prediction result. When input data is on GPU, prediction result is + stored in a cupy array. """ preds = ctypes.POINTER(ctypes.c_float)() diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index a46ba14d0..e9f9e9f10 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -273,7 +273,7 @@ __model_doc = f""" * For linear model, only "weight" is defined and it's the normalized coefficients without bias. 
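The ``ref`` relationship described in the revised ``QuantileDMatrix`` docstring earlier in this patch looks roughly like this in practice (random arrays as stand-in data):

.. code-block:: python

    import numpy as np
    import xgboost as xgb

    rng = np.random.default_rng(0)
    X_train, y_train = rng.random((256, 8)), rng.random(256)
    X_valid, y_valid = rng.random((64, 8)), rng.random(64)

    # The training matrix owns the quantile cuts; the validation matrix
    # borrows them via `ref` instead of computing its own.
    Xy_train = xgb.QuantileDMatrix(X_train, label=y_train, max_bin=256)
    Xy_valid = xgb.QuantileDMatrix(X_valid, label=y_valid, ref=Xy_train)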
- gpu_id : Optional[int] + device : Optional[str] Device ordinal. validate_parameters : Optional[bool] Give warnings for unknown parameter. @@ -647,7 +647,7 @@ class XGBModel(XGBModelBase): monotone_constraints: Optional[Union[Dict[str, int], str]] = None, interaction_constraints: Optional[Union[str, Sequence[Sequence[str]]]] = None, importance_type: Optional[str] = None, - gpu_id: Optional[int] = None, + device: Optional[str] = None, validate_parameters: Optional[bool] = None, enable_categorical: bool = False, feature_types: Optional[FeatureTypes] = None, @@ -693,7 +693,7 @@ class XGBModel(XGBModelBase): self.monotone_constraints = monotone_constraints self.interaction_constraints = interaction_constraints self.importance_type = importance_type - self.gpu_id = gpu_id + self.device = device self.validate_parameters = validate_parameters self.enable_categorical = enable_categorical self.feature_types = feature_types diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index 0181e678d..a170fbf9f 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -1,4 +1,4 @@ -"""Xgboost pyspark integration submodule for core code.""" +"""XGBoost pyspark integration submodule for core code.""" import base64 # pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name @@ -133,6 +133,7 @@ _inverse_pyspark_param_alias_map = {v: k for k, v in _pyspark_param_alias_map.it _unsupported_xgb_params = [ "gpu_id", # we have "use_gpu" pyspark param instead. + "device", # we have "use_gpu" pyspark param instead. "enable_categorical", # Use feature_types param to specify categorical feature instead "use_label_encoder", "n_jobs", # Do not allow user to set it, will use `spark.task.cpus` value instead. @@ -899,12 +900,14 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): context = BarrierTaskContext.get() - gpu_id = None + dev_ordinal = None use_hist = booster_params.get("tree_method", None) in ("hist", "gpu_hist") if use_gpu: - gpu_id = context.partitionId() if is_local else _get_gpu_id(context) - booster_params["gpu_id"] = gpu_id + dev_ordinal = ( + context.partitionId() if is_local else _get_gpu_id(context) + ) + booster_params["device"] = "cuda:" + str(dev_ordinal) # If cuDF is not installed, then using DMatrix instead of QDM, # because without cuDF, DMatrix performs better than QDM. 
# Note: Checking `is_cudf_available` in spark worker side because @@ -945,7 +948,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): dtrain, dvalid = create_dmatrix_from_partitions( pandas_df_iter, feature_prop.features_cols_names, - gpu_id, + dev_ordinal, use_qdm, dmatrix_kwargs, enable_sparse_data_optim=feature_prop.enable_sparse_data_optim, diff --git a/python-package/xgboost/spark/data.py b/python-package/xgboost/spark/data.py index 8f84459d7..f9c12ba66 100644 --- a/python-package/xgboost/spark/data.py +++ b/python-package/xgboost/spark/data.py @@ -157,7 +157,7 @@ def _read_csr_matrix_from_unwrapped_spark_vec(part: pd.DataFrame) -> csr_matrix: def make_qdm( data: Dict[str, List[np.ndarray]], - gpu_id: Optional[int], + dev_ordinal: Optional[int], meta: Dict[str, Any], ref: Optional[DMatrix], params: Dict[str, Any], @@ -165,7 +165,7 @@ def make_qdm( """Handle empty partition for QuantileDMatrix.""" if not data: return QuantileDMatrix(np.empty((0, 0)), ref=ref) - it = PartIter(data, gpu_id, **meta) + it = PartIter(data, dev_ordinal, **meta) m = QuantileDMatrix(it, **params, ref=ref) return m @@ -173,7 +173,7 @@ def make_qdm( def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments iterator: Iterator[pd.DataFrame], feature_cols: Optional[Sequence[str]], - gpu_id: Optional[int], + dev_ordinal: Optional[int], use_qdm: bool, kwargs: Dict[str, Any], # use dict to make sure this parameter is passed. enable_sparse_data_optim: bool, @@ -187,7 +187,7 @@ def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments Pyspark partition iterator. feature_cols: A sequence of feature names, used only when rapids plugin is enabled. - gpu_id: + dev_ordinal: Device ordinal, used when GPU is enabled. use_qdm : Whether QuantileDMatrix should be used instead of DMatrix. @@ -304,13 +304,13 @@ def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments if feature_cols is not None and use_qdm: cache_partitions(iterator, append_fn) - dtrain: DMatrix = make_qdm(train_data, gpu_id, meta, None, params) + dtrain: DMatrix = make_qdm(train_data, dev_ordinal, meta, None, params) elif feature_cols is not None and not use_qdm: cache_partitions(iterator, append_fn) dtrain = make(train_data, kwargs) elif feature_cols is None and use_qdm: cache_partitions(iterator, append_fn) - dtrain = make_qdm(train_data, gpu_id, meta, None, params) + dtrain = make_qdm(train_data, dev_ordinal, meta, None, params) else: cache_partitions(iterator, append_fn) dtrain = make(train_data, kwargs) @@ -324,7 +324,7 @@ def create_dmatrix_from_partitions( # pylint: disable=too-many-arguments if has_validation_col: if use_qdm: dvalid: Optional[DMatrix] = make_qdm( - valid_data, gpu_id, meta, dtrain, params + valid_data, dev_ordinal, meta, dtrain, params ) else: dvalid = make(valid_data, kwargs) if has_validation_col else None diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py index 5054ef0dd..ba75aca7f 100644 --- a/python-package/xgboost/spark/estimator.py +++ b/python-package/xgboost/spark/estimator.py @@ -78,8 +78,7 @@ def _set_pyspark_xgb_cls_param_attrs( class SparkXGBRegressor(_SparkXGBEstimator): - """ - SparkXGBRegressor is a PySpark ML estimator. It implements the XGBoost regression + """SparkXGBRegressor is a PySpark ML estimator. 
It implements the XGBoost regression algorithm based on XGBoost python library, and it can be used in PySpark Pipeline and PySpark ML meta algorithms like :py:class:`~pyspark.ml.tuning.CrossValidator`/ :py:class:`~pyspark.ml.tuning.TrainValidationSplit`/ @@ -89,8 +88,8 @@ class SparkXGBRegressor(_SparkXGBEstimator): :py:class:`xgboost.XGBRegressor` constructor and most of the parameters used in :py:meth:`xgboost.XGBRegressor.fit` and :py:meth:`xgboost.XGBRegressor.predict` method. - SparkXGBRegressor doesn't support setting `gpu_id` but support another param `use_gpu`, - see doc below for more details. + SparkXGBRegressor doesn't support setting `device` but supports another param + `use_gpu`, see doc below for more details. SparkXGBRegressor doesn't support setting `base_margin` explicitly as well, but support another param called `base_margin_col`. see doc below for more details. @@ -247,8 +246,8 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction :py:class:`xgboost.XGBClassifier` constructor and most of the parameters used in :py:meth:`xgboost.XGBClassifier.fit` and :py:meth:`xgboost.XGBClassifier.predict` method. - SparkXGBClassifier doesn't support setting `gpu_id` but support another param `use_gpu`, - see doc below for more details. + SparkXGBClassifier doesn't support setting `device` but support another param + `use_gpu`, see doc below for more details. SparkXGBClassifier doesn't support setting `base_margin` explicitly as well, but support another param called `base_margin_col`. see doc below for more details. @@ -423,7 +422,7 @@ class SparkXGBRanker(_SparkXGBEstimator): :py:class:`xgboost.XGBRanker` constructor and most of the parameters used in :py:meth:`xgboost.XGBRanker.fit` and :py:meth:`xgboost.XGBRanker.predict` method. - SparkXGBRanker doesn't support setting `gpu_id` but support another param `use_gpu`, + SparkXGBRanker doesn't support setting `device` but support another param `use_gpu`, see doc below for more details. SparkXGBRanker doesn't support setting `base_margin` explicitly as well, but support diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 8e2e13f43..6445f1c94 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -723,24 +723,6 @@ def predictor_equal(lhs: xgb.DMatrix, rhs: xgb.DMatrix) -> bool: M = TypeVar("M", xgb.Booster, xgb.XGBModel) -def set_ordinal(ordinal: int, booster: M) -> M: - """Temporary solution for setting the device ordinal until we move away from - `gpu_id`. - - """ - if ordinal < 0: - params = {"gpu_id": -1, "tree_method": "hist"} - else: - params = {"gpu_id": ordinal, "tree_method": "gpu_hist"} - - if isinstance(booster, xgb.Booster): - booster.set_param(params) - elif isinstance(booster, xgb.XGBModel): - booster.set_params(**params) - - return booster - - def eval_error_metric(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, np.float64]: """Evaluation metric for xgb.train""" label = dtrain.get_label() diff --git a/src/c_api/c_api.cu b/src/c_api/c_api.cu index af060f6dc..964ab0c3f 100644 --- a/src/c_api/c_api.cu +++ b/src/c_api/c_api.cu @@ -117,10 +117,7 @@ int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface, RequiredArg(config, "iteration_begin", __func__), RequiredArg(config, "iteration_end", __func__)); CHECK(p_predt); - if (learner->Ctx()->IsCPU()) { - // Prediction using DMatrix as fallback. 
- CHECK(p_predt->HostCanRead() && !p_predt->DeviceCanRead()); - } else { + if (learner->Ctx()->IsCUDA()) { CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead()); } p_predt->SetDevice(proxy->DeviceIdx()); diff --git a/src/common/error_msg.cc b/src/common/error_msg.cc index 813cbe8b1..bb57014a6 100644 --- a/src/common/error_msg.cc +++ b/src/common/error_msg.cc @@ -3,23 +3,18 @@ */ #include "error_msg.h" +#include "../collective/communicator-inl.h" // for GetRank #include "xgboost/logging.h" namespace xgboost::error { void WarnDeprecatedGPUHist() { - bool static thread_local logged{false}; - if (logged) { - return; - } auto msg = "The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` " R"(parameter to CUDA instead. E.g. tree_method = "hist", device = "CUDA" - )"; LOG(WARNING) << msg; - logged = true; } void WarnManualUpdater() { @@ -33,4 +28,23 @@ void WarnManualUpdater() { "behavior. For common uses, we recommend using `tree_method` parameter instead."; logged = true; } + +void WarnDeprecatedGPUId() { + static thread_local bool logged{false}; + if (logged) { + return; + } + LOG(WARNING) << "`gpu_id` is deprecated in favor of the new `device` parameter: " + << "device = cpu/cuda/cuda:0"; + logged = true; +} + +void WarnEmptyDataset() { + static thread_local bool logged{false}; + if (logged) { + return; + } + LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank(); + logged = true; +} } // namespace xgboost::error diff --git a/src/common/error_msg.h b/src/common/error_msg.h index c19197007..07b5c3e53 100644 --- a/src/common/error_msg.h +++ b/src/common/error_msg.h @@ -82,5 +82,9 @@ inline void WarnOldSerialization() { void WarnDeprecatedGPUHist(); void WarnManualUpdater(); + +void WarnDeprecatedGPUId(); + +void WarnEmptyDataset(); } // namespace xgboost::error #endif // XGBOOST_COMMON_ERROR_MSG_H_ diff --git a/src/context.cc b/src/context.cc index 28fda9c45..1acaa6443 100644 --- a/src/context.cc +++ b/src/context.cc @@ -3,53 +3,201 @@ * * \brief Context object used for controlling runtime parameters. */ -#include +#include "xgboost/context.h" -#include "common/common.h" // AssertGPUSupport +#include // for find_if +#include // for from_chars +#include // for distance +#include // for optional +#include // for regex_replace, regex_match + +#include "common/common.h" // AssertGPUSupport +#include "common/error_msg.h" // WarnDeprecatedGPUId #include "common/threading_utils.h" +#include "xgboost/string_view.h" namespace xgboost { DMLC_REGISTER_PARAMETER(Context); -std::int32_t constexpr Context::kCpuId; +bst_d_ordinal_t constexpr Context::kCpuId; std::int64_t constexpr Context::kDefaultSeed; Context::Context() : cfs_cpu_count_{common::GetCfsCPUCount()} {} -void Context::ConfigureGpuId(bool require_gpu) { -#if defined(XGBOOST_USE_CUDA) - if (gpu_id == kCpuId) { // 0. User didn't specify the `gpu_id' - if (require_gpu) { // 1. `tree_method' or `predictor' or both are using - // GPU. - // 2. Use device 0 as default. - this->UpdateAllowUnknown(Args{{"gpu_id", "0"}}); - } - } +namespace { +inline constexpr char const* kDevice = "device"; - // 3. When booster is loaded from a memory image (Python pickle or R - // raw model), number of available GPUs could be different. Wrap around it. 
- int32_t n_gpus = common::AllVisibleGPUs(); - if (n_gpus == 0) { - if (gpu_id != kCpuId) { - LOG(WARNING) << "No visible GPU is found, setting `gpu_id` to -1"; - } - this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(kCpuId)}}); - } else if (fail_on_invalid_gpu_id) { - CHECK(gpu_id == kCpuId || gpu_id < n_gpus) - << "Only " << n_gpus << " GPUs are visible, gpu_id " << gpu_id << " is invalid."; - } else if (gpu_id != kCpuId && gpu_id >= n_gpus) { - LOG(WARNING) << "Only " << n_gpus << " GPUs are visible, setting `gpu_id` to " - << gpu_id % n_gpus; - this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(gpu_id % n_gpus)}}); - } +#if !defined(XGBOOST_USE_CUDA) +DeviceOrd CUDAOrdinal(DeviceOrd device, bool) { + device = DeviceOrd::CPU(); + return device; +} #else - // Just set it to CPU, don't think about it. - this->UpdateAllowUnknown(Args{{"gpu_id", std::to_string(kCpuId)}}); - (void)(require_gpu); -#endif // defined(XGBOOST_USE_CUDA) +// Check CUDA on the current device, wrap the ordinal if necessary. +[[nodiscard]] DeviceOrd CUDAOrdinal(DeviceOrd device, bool fail_on_invalid) { + // When booster is loaded from a memory image (Python pickle or R raw model), number of + // available GPUs could be different. Wrap around it. + std::int32_t n_visible = common::AllVisibleGPUs(); + if (n_visible == 0) { + if (device.IsCUDA()) { + LOG(WARNING) << "No visible GPU is found, setting device to CPU."; + } + device = DeviceOrd::CPU(); + } else if (fail_on_invalid) { + CHECK(device.IsCPU() || device.ordinal < n_visible) + << "Only " << n_visible << " GPUs are visible, ordinal " << device.ordinal + << " is invalid."; + } else if (device.IsCUDA() && device.ordinal >= n_visible) { + device.ordinal = device.ordinal % n_visible; + LOG(WARNING) << "Only " << n_visible << " GPUs are visible, setting device ordinal to " + << device.ordinal; + } - common::SetDevice(this->gpu_id); + if (device.IsCUDA()) { + common::SetDevice(device.ordinal); + } + return device; +} +#endif // !defined(XGBOOST_USE_CUDA) + +[[nodiscard]] std::optional ParseInt(StringView ordinal) { + // Some basic checks to ensure valid `gpu_id` and device ordinal instead of directly parsing and + // letting go of unknown characters. + if (ordinal.empty()) { + return std::nullopt; + } + + std::size_t offset{0}; + if (ordinal[0] == '-') { + offset = 1; + } + if (ordinal.size() <= offset) { + return std::nullopt; + } + + bool valid = std::all_of(ordinal.cbegin() + offset, ordinal.cend(), + [](auto c) { return std::isdigit(c); }); + if (!valid) { + return std::nullopt; + } + + std::int32_t parsed_id{Context::kCpuId}; + auto res = std::from_chars(ordinal.c_str(), ordinal.c_str() + ordinal.size(), parsed_id); + if (res.ec != std::errc()) { + return std::nullopt; + } + + return parsed_id; +} + +[[nodiscard]] DeviceOrd MakeDeviceOrd(std::string const& input, bool fail_on_invalid_gpu_id) { + StringView msg{R"(Invalid argument for `device`. Expected to be one of the following: +- cpu +- cuda +- cuda: # e.g. cuda:0 +- gpu +- gpu: # e.g. gpu:0 +)"}; + auto fatal = [&] { LOG(FATAL) << msg << "Got: `" << input << "`."; }; + +#if defined(__MINGW32__) + // mingw hangs on regex using rtools 430. Basic checks only. 
+ CHECK_GE(input.size(), 3) << msg; + auto substr = input.substr(0, 3); + bool valid = substr == "cpu" || substr == "cud" || substr == "gpu"; + CHECK(valid) << msg; +#else + std::regex pattern{"gpu(:[0-9]+)?|cuda(:[0-9]+)?|cpu"}; + if (!std::regex_match(input, pattern)) { + fatal(); + } +#endif // defined(__MINGW32__) + + // handle alias + std::string s_device = std::regex_replace(input, std::regex{"gpu"}, DeviceSym::CUDA()); + + auto split_it = std::find(s_device.cbegin(), s_device.cend(), ':'); + DeviceOrd device; + device.ordinal = Context::InvalidOrdinal(); // mark it invalid for check. + if (split_it == s_device.cend()) { + // no ordinal. + if (s_device == DeviceSym::CPU()) { + device = DeviceOrd::CPU(); + } else if (s_device == DeviceSym::CUDA()) { + device = DeviceOrd::CUDA(0); // use 0 as default; + } else { + fatal(); + } + } else { + // must be CUDA when ordinal is specifed. + // +1 for colon + std::size_t offset = std::distance(s_device.cbegin(), split_it) + 1; + // substr + StringView s_ordinal = {s_device.data() + offset, s_device.size() - offset}; + if (s_ordinal.empty()) { + fatal(); + } + auto opt_id = ParseInt(s_ordinal); + if (!opt_id.has_value()) { + fatal(); + } + CHECK_LE(opt_id.value(), std::numeric_limits::max()) + << "Ordinal value too large."; + device = DeviceOrd::CUDA(opt_id.value()); + } + + if (device.ordinal < Context::kCpuId) { + fatal(); + } + device = CUDAOrdinal(device, fail_on_invalid_gpu_id); + + return device; +} +} // namespace + +void Context::ConfigureGpuId(bool require_gpu) { + if (this->IsCPU() && require_gpu) { + this->UpdateAllowUnknown(Args{{kDevice, DeviceSym::CUDA()}}); + } +} + +void Context::SetDeviceOrdinal(Args const& kwargs) { + auto gpu_id_it = std::find_if(kwargs.cbegin(), kwargs.cend(), + [](auto const& p) { return p.first == "gpu_id"; }); + auto has_gpu_id = gpu_id_it != kwargs.cend(); + auto device_it = std::find_if(kwargs.cbegin(), kwargs.cend(), + [](auto const& p) { return p.first == kDevice; }); + auto has_device = device_it != kwargs.cend(); + if (has_device && has_gpu_id) { + LOG(FATAL) << "Both `device` and `gpu_id` are specified. Use `device` instead."; + } + + if (has_gpu_id) { + // Compatible with XGBoost < 2.0.0 + error::WarnDeprecatedGPUId(); + auto opt_id = ParseInt(StringView{gpu_id_it->second}); + CHECK(opt_id.has_value()) << "Invalid value for `gpu_id`. 
Got:" << gpu_id_it->second; + if (opt_id.value() > Context::kCpuId) { + this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CUDA(opt_id.value()).Name()}}); + } else { + this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CPU().Name()}}); + } + return; + } + + auto new_d = MakeDeviceOrd(this->device, this->fail_on_invalid_gpu_id); + + if (!has_device) { + CHECK_EQ(new_d.ordinal, this->device_.ordinal); // unchanged + } + this->SetDevice(new_d); + + if (this->IsCPU()) { + CHECK_EQ(this->device_.ordinal, kCpuId); + } else { + CHECK_GT(this->device_.ordinal, kCpuId); + } } std::int32_t Context::Threads() const { diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index c2c9a1d70..a53b88c13 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -33,10 +33,11 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro bool valid = iter.Next(); CHECK(valid) << "Iterative DMatrix must have at least 1 batch."; - auto d = MakeProxy(proxy_)->DeviceIdx(); + auto pctx = MakeProxy(proxy_)->Ctx(); Context ctx; - ctx.UpdateAllowUnknown(Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(d)}}); + ctx.UpdateAllowUnknown( + Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}}); // hardcoded parameter. BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()}; diff --git a/src/data/proxy_dmatrix.cc b/src/data/proxy_dmatrix.cc index e0a28142d..cb8e290c8 100644 --- a/src/data/proxy_dmatrix.cc +++ b/src/data/proxy_dmatrix.cc @@ -54,6 +54,7 @@ std::shared_ptr CreateDMatrixFromProxy(Context const *ctx, p_fmat = cuda_impl::CreateDMatrixFromProxy(ctx, proxy, missing); } + CHECK(p_fmat) << "Failed to fallback."; return p_fmat; } } // namespace xgboost::data diff --git a/src/data/proxy_dmatrix.cu b/src/data/proxy_dmatrix.cu index 65abd1b7d..ded1c3aef 100644 --- a/src/data/proxy_dmatrix.cu +++ b/src/data/proxy_dmatrix.cu @@ -7,28 +7,31 @@ namespace xgboost::data { void DMatrixProxy::FromCudaColumnar(StringView interface_str) { - std::shared_ptr adapter{new CudfAdapter{interface_str}}; - auto const& value = adapter->Value(); + auto adapter{std::make_shared(interface_str)}; this->batch_ = adapter; - ctx_.gpu_id = adapter->DeviceIdx(); this->Info().num_col_ = adapter->NumColumns(); this->Info().num_row_ = adapter->NumRows(); - if (ctx_.gpu_id < 0) { + if (adapter->DeviceIdx() < 0) { + // empty data CHECK_EQ(this->Info().num_row_, 0); - ctx_.gpu_id = dh::CurrentDevice(); + ctx_ = ctx_.MakeCUDA(dh::CurrentDevice()); + return; } + ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx()); } void DMatrixProxy::FromCudaArray(StringView interface_str) { - std::shared_ptr adapter(new CupyAdapter{StringView{interface_str}}); + auto adapter(std::make_shared(StringView{interface_str})); this->batch_ = adapter; - ctx_.gpu_id = adapter->DeviceIdx(); this->Info().num_col_ = adapter->NumColumns(); this->Info().num_row_ = adapter->NumRows(); - if (ctx_.gpu_id < 0) { + if (adapter->DeviceIdx() < 0) { + // empty data CHECK_EQ(this->Info().num_row_, 0); - ctx_.gpu_id = dh::CurrentDevice(); + ctx_ = ctx_.MakeCUDA(dh::CurrentDevice()); + return; } + ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx()); } namespace cuda_impl { diff --git a/src/data/simple_dmatrix.cu b/src/data/simple_dmatrix.cu index b2be701d5..68cab0d5a 100644 --- a/src/data/simple_dmatrix.cu +++ b/src/data/simple_dmatrix.cu @@ -27,7 +27,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr dh::safe_cuda(cudaSetDevice(device)); Context ctx; - 
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(device)}}); + ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}}); CHECK(adapter->NumRows() != kAdapterUnknownSize); CHECK(adapter->NumColumns() != kAdapterUnknownSize); diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 55b935ea0..e97b27665 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -84,6 +84,25 @@ bool UpdatersMatched(std::vector updater_seq, return name == up->Name(); }); } + +void MismatchedDevices(Context const* booster, Context const* data) { + bool thread_local static logged{false}; + if (logged) { + return; + } + LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. This might " + "lead to higher memory usage and slower performance. XGBoost is running on: " + << booster->DeviceName() << ", while the input data is on: " << data->DeviceName() + << ".\n" + << R"(Potential solutions: +- Use a data structure that matches the device ordinal in the booster. +- Set the device for booster before call to inplace_predict. + +This warning will only be shown once, and subsequent warnings made by the current thread will be +suppressed. +)"; + logged = true; +} } // namespace void GBTree::Configure(Args const& cfg) { @@ -208,6 +227,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector* in_gpair, bst_target_t const n_groups = model_.learner_model_param->OutputLength(); monitor_.Start("BoostNewTrees"); + predt->predictions.SetDevice(ctx_->Ordinal()); auto out = linalg::MakeTensorView(ctx_, &predt->predictions, p_fmat->Info().num_row_, model_.learner_model_param->OutputLength()); CHECK_NE(n_groups, 0); @@ -521,18 +541,6 @@ void GBTree::PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds, } } -namespace { -inline void MismatchedDevices(Context const* booster, Context const* data) { - LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. XGBoost " - << "is running on: " << booster->DeviceName() - << ", while the input data is on: " << data->DeviceName() << ".\n" - << R"(Potential solutions: -- Use a data structure that matches the device ordinal in the booster. -- Set the device for booster before call to inplace_predict. -)"; -} -}; // namespace - void GBTree::PredictBatch(DMatrix* p_fmat, PredictionCacheEntry* out_preds, bool is_training, bst_layer_t layer_begin, bst_layer_t layer_end) { // dispatch to const function. diff --git a/src/learner.cc b/src/learner.cc index 4fd0a0f09..03714a056 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -40,7 +40,7 @@ #include "common/api_entry.h" // for XGBAPIThreadLocalEntry #include "common/charconv.h" // for to_chars, to_chars_result, NumericLimits, from_... #include "common/common.h" // for ToString, Split -#include "common/error_msg.h" // for MaxFeatureSize, WarnOldSerialization +#include "common/error_msg.h" // for MaxFeatureSize, WarnOldSerialization, ... #include "common/io.h" // for PeekableInStream, ReadAll, FixedSizeStream, Mem... #include "common/observer.h" // for TrainingObserver #include "common/random.h" // for GlobalRandom @@ -711,6 +711,7 @@ class LearnerConfiguration : public Learner { // FIXME(trivialfis): Make eval_metric a training parameter. keys.emplace_back(kEvalMetric); keys.emplace_back("num_output_group"); + keys.emplace_back("gpu_id"); // deprecated param. 
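The device-mismatch fallback introduced in the ``gbtree.cc`` hunk above can be reproduced from Python roughly as follows (a sketch assuming a CUDA build with a visible GPU):

.. code-block:: python

    import numpy as np
    import xgboost as xgb

    X, y = np.random.rand(128, 4), np.random.rand(128)
    booster = xgb.train(
        {"tree_method": "hist", "device": "cuda:0"},
        xgb.DMatrix(X, label=y),
        num_boost_round=4,
    )

    # NumPy input lives on the CPU while the booster sits on CUDA, so this
    # call falls back to DMatrix-based prediction and emits the warning once
    # per thread.
    booster.inplace_predict(X)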
std::sort(keys.begin(), keys.end()); @@ -1340,10 +1341,9 @@ class LearnerImpl : public LearnerIO { } void Predict(std::shared_ptr data, bool output_margin, - HostDeviceVector *out_preds, unsigned layer_begin, - unsigned layer_end, bool training, - bool pred_leaf, bool pred_contribs, bool approx_contribs, - bool pred_interactions) override { + HostDeviceVector* out_preds, bst_layer_t layer_begin, + bst_layer_t layer_end, bool training, bool pred_leaf, bool pred_contribs, + bool approx_contribs, bool pred_interactions) override { int multiple_predictions = static_cast(pred_leaf) + static_cast(pred_interactions) + static_cast(pred_contribs); @@ -1391,15 +1391,16 @@ class LearnerImpl : public LearnerIO { } void InplacePredict(std::shared_ptr p_m, PredictionType type, float missing, - HostDeviceVector** out_preds, uint32_t iteration_begin, - uint32_t iteration_end) override { + HostDeviceVector** out_preds, bst_layer_t iteration_begin, + bst_layer_t iteration_end) override { this->Configure(); this->CheckModelInitialized(); auto& out_predictions = this->GetThreadLocal().prediction_entry; - out_predictions.version = 0; + out_predictions.Reset(); this->gbm_->InplacePredict(p_m, missing, &out_predictions, iteration_begin, iteration_end); + if (type == PredictionType::kValue) { obj_->PredTransform(&out_predictions.predictions); } else if (type == PredictionType::kMargin) { @@ -1454,7 +1455,7 @@ class LearnerImpl : public LearnerIO { } if (p_fmat->Info().num_row_ == 0) { - LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank(); + error::WarnEmptyDataset(); } } diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py index dda2746bf..08baa844b 100644 --- a/tests/ci_build/lint_python.py +++ b/tests/ci_build/lint_python.py @@ -28,6 +28,7 @@ class LintersPaths: "tests/python-gpu/test_gpu_prediction.py", "tests/python-gpu/load_pickle.py", "tests/python-gpu/test_gpu_pickling.py", + "tests/python-gpu/test_gpu_eval_metrics.py", "tests/test_distributed/test_with_spark/", "tests/test_distributed/test_gpu_with_spark/", # demo diff --git a/tests/cpp/common/test_algorithm.cu b/tests/cpp/common/test_algorithm.cu index 26c9aea4d..c36073397 100644 --- a/tests/cpp/common/test_algorithm.cu +++ b/tests/cpp/common/test_algorithm.cu @@ -16,8 +16,7 @@ namespace xgboost { namespace common { void TestSegmentedArgSort() { - Context ctx; - ctx.gpu_id = 0; + auto ctx = MakeCUDACtx(0); size_t constexpr kElements = 100, kGroups = 3; dh::device_vector sorted_idx(kElements, 0); @@ -55,8 +54,7 @@ void TestSegmentedArgSort() { TEST(Algorithm, SegmentedArgSort) { TestSegmentedArgSort(); } TEST(Algorithm, GpuArgSort) { - Context ctx; - ctx.gpu_id = 0; + auto ctx = MakeCUDACtx(0); dh::device_vector values(20); dh::Iota(dh::ToSpan(values)); // accending diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index 127cd95d4..2d5735925 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -227,7 +227,7 @@ TEST(HistUtil, RemoveDuplicatedCategories) { } // check categorical beg = n_samples; - for (std::size_t i = 0; i < n_categories; ++i) { + for (bst_cat_t i = 0; i < n_categories; ++i) { // all from the second column ASSERT_EQ(static_cast(weight[i + beg]) % n_features, 1); } diff --git a/tests/cpp/common/test_linalg.cu b/tests/cpp/common/test_linalg.cu index fe38f0f9b..be89d51bc 100644 --- a/tests/cpp/common/test_linalg.cu +++ b/tests/cpp/common/test_linalg.cu @@ -4,6 +4,7 @@ #include #include "../../../src/common/linalg_op.cuh" +#include 
"../helpers.h" #include "xgboost/context.h" #include "xgboost/linalg.h" @@ -54,8 +55,7 @@ void TestElementWiseKernel() { } void TestSlice() { - Context ctx; - ctx.gpu_id = 1; + auto ctx = MakeCUDACtx(1); thrust::device_vector data(2 * 3 * 4); auto t = MakeTensorView(&ctx, dh::ToSpan(data), 2, 3, 4); dh::LaunchN(1, [=] __device__(size_t) { diff --git a/tests/cpp/common/test_ranking_utils.cu b/tests/cpp/common/test_ranking_utils.cu index db0ff3b66..d62f5f171 100644 --- a/tests/cpp/common/test_ranking_utils.cu +++ b/tests/cpp/common/test_ranking_utils.cu @@ -23,8 +23,7 @@ namespace xgboost::ltr { void TestCalcQueriesInvIDCG() { - Context ctx; - ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}}); + auto ctx = MakeCUDACtx(0); std::size_t n_groups = 5, n_samples_per_group = 32; dh::device_vector scores(n_samples_per_group * n_groups); @@ -85,20 +84,17 @@ void TestRankingCache(Context const* ctx) { } // namespace TEST(RankingCache, InitFromGPU) { - Context ctx; - ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}}); + auto ctx = MakeCUDACtx(0); TestRankingCache(&ctx); } TEST(NDCGCache, InitFromGPU) { - Context ctx; - ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}}); + auto ctx = MakeCUDACtx(0); TestNDCGCache(&ctx); } TEST(MAPCache, InitFromGPU) { - Context ctx; - ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}}); + auto ctx = MakeCUDACtx(0); TestMAPCache(&ctx); } } // namespace xgboost::ltr diff --git a/tests/cpp/common/test_stats.cc b/tests/cpp/common/test_stats.cc index abdf00425..e74caeb3a 100644 --- a/tests/cpp/common/test_stats.cc +++ b/tests/cpp/common/test_stats.cc @@ -7,6 +7,7 @@ #include "../../../src/common/stats.h" #include "../../../src/common/transform_iterator.h" // common::MakeIndexTransformIter +#include "../helpers.h" namespace xgboost { namespace common { @@ -71,7 +72,7 @@ TEST(Stats, Median) { ASSERT_EQ(m, .5f); #if defined(XGBOOST_USE_CUDA) - ctx.gpu_id = 0; + ctx = ctx.MakeCUDA(0); ASSERT_FALSE(ctx.IsCPU()); Median(&ctx, values, weights, &out); m = out(0); @@ -80,7 +81,7 @@ TEST(Stats, Median) { } { - ctx.gpu_id = Context::kCpuId; + ctx = ctx.MakeCPU(); // 4x2 matrix linalg::Tensor values{{0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 2.f, 2.f}, {4, 2}, ctx.gpu_id}; HostDeviceVector weights; @@ -90,7 +91,7 @@ TEST(Stats, Median) { ASSERT_EQ(out(1), .5f); #if defined(XGBOOST_USE_CUDA) - ctx.gpu_id = 0; + ctx = ctx.MakeCUDA(0); Median(&ctx, values, weights, &out); ASSERT_EQ(out(0), .5f); ASSERT_EQ(out(1), .5f); @@ -123,8 +124,7 @@ TEST(Stats, Mean) { #if defined(XGBOOST_USE_CUDA) TEST(Stats, GPUMean) { - Context ctx; - ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}}); + auto ctx = MakeCUDACtx(0); TestMean(&ctx); } #endif // defined(XGBOOST_USE_CUDA) diff --git a/tests/cpp/common/test_stats.cu b/tests/cpp/common/test_stats.cu index 8643e75a7..08877ac8d 100644 --- a/tests/cpp/common/test_stats.cu +++ b/tests/cpp/common/test_stats.cu @@ -3,16 +3,17 @@ */ #include -#include // std::size_t -#include // std::pair -#include // std::vector +#include // std::size_t +#include // std::pair +#include // std::vector #include "../../../src/common/linalg_op.cuh" // ElementWiseTransformDevice #include "../../../src/common/stats.cuh" -#include "xgboost/base.h" // XGBOOST_DEVICE -#include "xgboost/context.h" // Context -#include "xgboost/host_device_vector.h" // HostDeviceVector -#include "xgboost/linalg.h" // Tensor +#include "../helpers.h" +#include "xgboost/base.h" // XGBOOST_DEVICE +#include "xgboost/context.h" // Context +#include "xgboost/host_device_vector.h" // HostDeviceVector +#include "xgboost/linalg.h" // Tensor 
namespace xgboost { namespace common { @@ -33,7 +34,7 @@ class StatsGPU : public ::testing::Test { } public: - void SetUp() override { ctx_.gpu_id = 0; } + void SetUp() override { ctx_ = MakeCUDACtx(0); } void WeightedMulti() { // data for one segment diff --git a/tests/cpp/data/test_gradient_index.cc b/tests/cpp/data/test_gradient_index.cc index 5354c2f1a..f2ade711b 100644 --- a/tests/cpp/data/test_gradient_index.cc +++ b/tests/cpp/data/test_gradient_index.cc @@ -171,8 +171,7 @@ class GHistIndexMatrixTest : public testing::TestWithParamSingleColBlock()); bst_bin_t constexpr kBins{17}; auto p = BatchParam{kBins, threshold}; - Context gpu_ctx; - gpu_ctx.gpu_id = 0; + auto gpu_ctx = MakeCUDACtx(0); for (auto const &page : Xy->GetBatches( &gpu_ctx, BatchParam{kBins, tree::TrainParam::DftSparseThreshold()})) { from_ellpack = std::make_unique(&ctx, Xy->Info(), page, p); diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc index 1c0927031..9e6311701 100644 --- a/tests/cpp/gbm/test_gbtree.cc +++ b/tests/cpp/gbm/test_gbtree.cc @@ -180,7 +180,12 @@ TEST(GBTree, ChooseTreeMethod) { learner->SetParam("tree_method", tree_method.value()); } if (device.has_value()) { - learner->SetParam("gpu_id", device.value()); + auto const& d = device.value(); + if (std::isdigit(d.front()) || d.front() == '-') { + learner->SetParam("gpu_id", d); + } else { + learner->SetParam("device", d); + } } learner->Configure(); for (std::int32_t i = 0; i < 3; ++i) { @@ -199,7 +204,12 @@ TEST(GBTree, ChooseTreeMethod) { learner->SetParam("tree_method", tree_method.value()); } if (device.has_value()) { - learner->SetParam("gpu_id", device.value()); + auto const& d = device.value(); + if (std::isdigit(d.front()) || d.front() == '-') { + learner->SetParam("gpu_id", d); + } else { + learner->SetParam("device", d); + } } learner->Configure(); for (std::int32_t i = 0; i < 3; ++i) { @@ -215,11 +225,12 @@ TEST(GBTree, ChooseTreeMethod) { // | | hist | gpu_hist | exact | NA | // |--------+---------+----------+-------+-----| - // | CUDA:0 | GPU | GPU (w) | Err | GPU | # not yet tested - // | CPU | CPU | Err | CPU | CPU | # not yet tested + // | CUDA:0 | GPU | GPU (w) | Err | GPU | + // | CPU | CPU | GPU (w) | CPU | CPU | // |--------+---------+----------+-------+-----| // | -1 | CPU | GPU (w) | CPU | CPU | // | 0 | GPU | GPU (w) | Err | GPU | + // |--------+---------+----------+-------+-----| // | NA | CPU | GPU (w) | CPU | CPU | // // - (w): warning @@ -237,18 +248,30 @@ TEST(GBTree, ChooseTreeMethod) { // hist {{"hist", "-1"}, "grow_quantile_histmaker"}, {{"hist", "0"}, "grow_gpu_hist"}, + {{"hist", "cpu"}, "grow_quantile_histmaker"}, + {{"hist", "cuda"}, "grow_gpu_hist"}, + {{"hist", "cuda:0"}, "grow_gpu_hist"}, {{"hist", std::nullopt}, "grow_quantile_histmaker"}, // gpu_hist {{"gpu_hist", "-1"}, "grow_gpu_hist"}, {{"gpu_hist", "0"}, "grow_gpu_hist"}, + {{"gpu_hist", "cpu"}, "grow_gpu_hist"}, + {{"gpu_hist", "cuda"}, "grow_gpu_hist"}, + {{"gpu_hist", "cuda:0"}, "grow_gpu_hist"}, {{"gpu_hist", std::nullopt}, "grow_gpu_hist"}, // exact {{"exact", "-1"}, "grow_colmaker,prune"}, {{"exact", "0"}, "err"}, + {{"exact", "cpu"}, "grow_colmaker,prune"}, + {{"exact", "cuda"}, "err"}, + {{"exact", "cuda:0"}, "err"}, {{"exact", std::nullopt}, "grow_colmaker,prune"}, // NA {{std::nullopt, "-1"}, "grow_quantile_histmaker"}, {{std::nullopt, "0"}, "grow_gpu_hist"}, // default to hist + {{std::nullopt, "cpu"}, "grow_quantile_histmaker"}, + {{std::nullopt, "cuda"}, "grow_gpu_hist"}, + {{std::nullopt, "cuda:0"}, "grow_gpu_hist"}, 
{{std::nullopt, std::nullopt}, "grow_quantile_histmaker"}, }; @@ -392,8 +415,7 @@ class Dart : public testing::TestWithParam { for (size_t i = 0; i < 16; ++i) { learner->UpdateOneIter(i, p_mat); } - - ConfigLearnerByCtx(&ctx, learner.get()); + learner->SetParam("device", ctx.DeviceName()); HostDeviceVector predts_training; learner->Predict(p_mat, false, &predts_training, 0, 0, true); @@ -654,8 +676,7 @@ TEST(GBTree, InplacePredictionError) { RandomDataGenerator{n_samples, n_features, 0.5f}.Batches(2).GenerateSparsePageDMatrix( "cache", true); std::unique_ptr learner{Learner::Create({p_fmat})}; - learner->SetParam("booster", booster); - ConfigLearnerByCtx(ctx, learner.get()); + learner->SetParams(Args{{"booster", booster}, {"device", ctx->DeviceName()}}); learner->Configure(); for (std::int32_t i = 0; i < 3; ++i) { learner->UpdateOneIter(i, p_fmat); @@ -697,9 +718,9 @@ TEST(GBTree, InplacePredictionError) { #endif // defined(XGBOOST_USE_CUDA) }; std::unique_ptr learner{Learner::Create({p_fmat})}; - learner->SetParam("booster", booster); - learner->SetParam("max_bin", std::to_string(max_bins)); - ConfigLearnerByCtx(ctx, learner.get()); + learner->SetParams(Args{{"booster", booster}, + {"max_bin", std::to_string(max_bins)}, + {"device", ctx->DeviceName()}}); learner->Configure(); for (std::int32_t i = 0; i < 3; ++i) { learner->UpdateOneIter(i, p_fmat); diff --git a/tests/cpp/gbm/test_gbtree.cu b/tests/cpp/gbm/test_gbtree.cu index 7321be75e..03f689822 100644 --- a/tests/cpp/gbm/test_gbtree.cu +++ b/tests/cpp/gbm/test_gbtree.cu @@ -8,6 +8,7 @@ #include // for numeric_limits #include // for shared_ptr #include // for string +#include // for thread #include "../../../src/data/adapter.h" // for ArrayAdapter #include "../../../src/data/device_adapter.cuh" // for CupyAdapter @@ -41,7 +42,7 @@ void TestInplaceFallback(Context const* ctx) { // learner is configured to the device specified by ctx std::unique_ptr learner{Learner::Create({Xy})}; - ConfigLearnerByCtx(ctx, learner.get()); + learner->SetParam("device", ctx->DeviceName()); for (std::int32_t i = 0; i < 3; ++i) { learner->UpdateOneIter(i, Xy); } @@ -56,18 +57,31 @@ void TestInplaceFallback(Context const* ctx) { HostDeviceVector* out_predt{nullptr}; ConsoleLogger::Configure(Args{{"verbosity", "1"}}); + std::string output; // test whether the warning is raised +#if !defined(_WIN32) + // Windows has issue with CUDA and thread local storage. For some reason, on Windows a + // cudaInitializationError is raised during destruction of `HostDeviceVector`. This + // might be related to https://github.com/dmlc/xgboost/issues/5793 ::testing::internal::CaptureStderr(); + std::thread{[&] { + // Launch a new thread to ensure a warning is raised as we prevent over-verbose + // warning by using thread-local flags. 
+ learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits::quiet_NaN(), + &out_predt, 0, 0); + }}.join(); + output = testing::internal::GetCapturedStderr(); + ASSERT_NE(output.find("Falling back"), std::string::npos); +#endif + learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits::quiet_NaN(), &out_predt, 0, 0); - auto output = testing::internal::GetCapturedStderr(); - ASSERT_NE(output.find("Falling back"), std::string::npos); // test when the contexts match Context new_ctx = *proxy->Ctx(); ASSERT_NE(new_ctx.gpu_id, ctx->gpu_id); - ConfigLearnerByCtx(&new_ctx, learner.get()); + learner->SetParam("device", new_ctx.DeviceName()); HostDeviceVector* out_predt_1{nullptr}; // no warning is raised ::testing::internal::CaptureStderr(); diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index 449d97a40..b166109d9 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -559,16 +559,4 @@ class DeclareUnifiedDistributedTest(MetricTest) : public ::testing::Test { } } }; - -// A temporary solution before we move away from gpu_id. -inline void ConfigLearnerByCtx(Context const* ctx, Learner* learner) { - if (ctx->IsCPU()) { - learner->SetParam("tree_method", "hist"); - } else { - learner->SetParam("tree_method", "gpu_hist"); - } - learner->SetParam("gpu_id", std::to_string(ctx->gpu_id)); - learner->Configure(); - ASSERT_EQ(learner->Ctx()->gpu_id, ctx->gpu_id); -} } // namespace xgboost diff --git a/tests/cpp/metric/test_multiclass_metric.h b/tests/cpp/metric/test_multiclass_metric.h index 0f4017041..5fdead596 100644 --- a/tests/cpp/metric/test_multiclass_metric.h +++ b/tests/cpp/metric/test_multiclass_metric.h @@ -46,7 +46,6 @@ inline void CheckDeterministicMetricMultiClass(StringView name, int32_t device) inline void TestMultiClassError(int device, DataSplitMode data_split_mode) { auto ctx = MakeCUDACtx(device); - ctx.gpu_id = device; xgboost::Metric * metric = xgboost::Metric::Create("merror", &ctx); metric->Configure({}); ASSERT_STREQ(metric->Name(), "merror"); @@ -67,7 +66,6 @@ inline void VerifyMultiClassError(DataSplitMode data_split_mode = DataSplitMode: inline void TestMultiClassLogLoss(int device, DataSplitMode data_split_mode) { auto ctx = MakeCUDACtx(device); - ctx.gpu_id = device; xgboost::Metric * metric = xgboost::Metric::Create("mlogloss", &ctx); metric->Configure({}); ASSERT_STREQ(metric->Name(), "mlogloss"); diff --git a/tests/cpp/objective/test_lambdarank_obj.cu b/tests/cpp/objective/test_lambdarank_obj.cu index d0f448993..16dc45307 100644 --- a/tests/cpp/objective/test_lambdarank_obj.cu +++ b/tests/cpp/objective/test_lambdarank_obj.cu @@ -13,26 +13,22 @@ namespace xgboost::obj { TEST(LambdaRank, GPUNDCGJsonIO) { - Context ctx; - ctx.gpu_id = 0; + auto ctx = MakeCUDACtx(0); TestNDCGJsonIO(&ctx); } TEST(LambdaRank, GPUMAPStat) { - Context ctx; - ctx.gpu_id = 0; + auto ctx = MakeCUDACtx(0); TestMAPStat(&ctx); } TEST(LambdaRank, GPUNDCGGPair) { - Context ctx; - ctx.gpu_id = 0; + auto ctx = MakeCUDACtx(0); TestNDCGGPair(&ctx); } void TestGPUMakePair() { - Context ctx; - ctx.gpu_id = 0; + auto ctx = MakeCUDACtx(0); MetaInfo info; HostDeviceVector predt; @@ -126,8 +122,7 @@ void TestGPUMakePair() { TEST(LambdaRank, GPUMakePair) { TestGPUMakePair(); } TEST(LambdaRank, GPUUnbiasedNDCG) { - Context ctx; - ctx.gpu_id = 0; + auto ctx = MakeCUDACtx(0); TestUnbiasedNDCG(&ctx); } @@ -161,8 +156,7 @@ TEST(LambdaRank, RankItemCountOnRight) { } TEST(LambdaRank, GPUMAPGPair) { - Context ctx; - ctx.gpu_id = 0; + auto ctx = MakeCUDACtx(0); TestMAPGPair(&ctx); } 
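The recurring refactor in these C++ tests, replacing manual `ctx.gpu_id` assignment with `MakeCUDACtx(0)`, is the internal counterpart of the user-facing `device` parameter introduced by this series. A minimal Python sketch of the user-side equivalent (synthetic data; assumes a CUDA-enabled build of XGBoost):

    import numpy as np
    import xgboost as xgb

    X = np.random.randn(256, 16)
    y = np.random.randn(256)
    dtrain = xgb.DMatrix(X, label=y)

    # Previously: {"tree_method": "gpu_hist", "gpu_id": 0}. With the new
    # Context, the device is a single parameter and the tree method no
    # longer encodes it.
    booster = xgb.train(
        {"tree_method": "hist", "device": "cuda:0"}, dtrain, num_boost_round=4
    )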
} // namespace xgboost::obj diff --git a/tests/cpp/objective/test_regression_obj.cc b/tests/cpp/objective/test_regression_obj.cc index 1b5573b0f..b8a40603b 100644 --- a/tests/cpp/objective/test_regression_obj.cc +++ b/tests/cpp/objective/test_regression_obj.cc @@ -305,12 +305,12 @@ TEST(Objective, CPU_vs_CUDA) { { // CPU - ctx.gpu_id = -1; + ctx = ctx.MakeCPU(); obj->GetGradient(preds, info, 0, &cpu_out_preds); } { // CUDA - ctx.gpu_id = 0; + ctx = ctx.MakeCUDA(0); obj->GetGradient(preds, info, 0, &cuda_out_preds); } diff --git a/tests/cpp/plugin/test_regression_obj_oneapi.cc b/tests/cpp/plugin/test_regression_obj_oneapi.cc index 031a9ec2c..c01d9d951 100755 --- a/tests/cpp/plugin/test_regression_obj_oneapi.cc +++ b/tests/cpp/plugin/test_regression_obj_oneapi.cc @@ -148,7 +148,7 @@ TEST(Plugin, CPUvsOneAPI) { { // CPU - ctx.gpu_id = -1; + ctx = ctx.MakeCPU(); obj_cpu->GetGradient(preds, info, 0, &cpu_out_preds); } { diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc index 841a576d5..a54c42a98 100644 --- a/tests/cpp/predictor/test_cpu_predictor.cc +++ b/tests/cpp/predictor/test_cpu_predictor.cc @@ -214,15 +214,16 @@ void TestUpdatePredictionCache(bool use_subsampling) { } } // namespace -TEST(CPUPredictor, GHistIndex) { +TEST(CPUPredictor, GHistIndexTraining) { size_t constexpr kRows{128}, kCols{16}, kBins{64}; + Context ctx; auto p_hist = RandomDataGenerator{kRows, kCols, 0.0}.Bins(kBins).GenerateQuantileDMatrix(false); HostDeviceVector storage(kRows * kCols); auto columnar = RandomDataGenerator{kRows, kCols, 0.0}.GenerateArrayInterface(&storage); auto adapter = data::ArrayAdapter(columnar.c_str()); std::shared_ptr p_full{ DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), 1)}; - TestTrainingPrediction(kRows, kBins, "hist", p_full, p_hist); + TestTrainingPrediction(&ctx, kRows, kBins, p_full, p_hist); } TEST(CPUPredictor, CategoricalPrediction) { diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu index 15fbd462e..be0cad5ce 100644 --- a/tests/cpp/predictor/test_gpu_predictor.cu +++ b/tests/cpp/predictor/test_gpu_predictor.cu @@ -33,9 +33,8 @@ TEST(GPUPredictor, Basic) { int n_row = i, n_col = i; auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix(); - Context ctx; - ctx.gpu_id = 0; - LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.gpu_id)}; + auto ctx = MakeCUDACtx(0); + LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())}; gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx); // Test predict batch @@ -71,7 +70,7 @@ void VerifyBasicColumnSplit(std::array, 32> const& expected_r auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix(); std::unique_ptr sliced{dmat->SliceCol(world_size, rank)}; - LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.gpu_id)}; + LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())}; gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx); // Test predict batch @@ -102,7 +101,7 @@ TEST(GPUPredictor, MGPUBasicColumnSplit) { size_t n_row = i, n_col = i; auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix(); - LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.gpu_id)}; + LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())}; gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx); // Test predict batch @@ -132,18 +131,19 @@ TEST(GPUPredictor, EllpackBasic) { } TEST(GPUPredictor, EllpackTraining) { - size_t constexpr kRows { 128 }, kCols { 16 }, kBins { 64 }; - auto p_ellpack = - 
RandomDataGenerator{kRows, kCols, 0.0}.Bins(kBins).Device(0).GenerateDeviceDMatrix(false); + auto ctx = MakeCUDACtx(0); + size_t constexpr kRows{128}, kCols{16}, kBins{64}; + auto p_ellpack = RandomDataGenerator{kRows, kCols, 0.0} + .Bins(kBins) + .Device(ctx.Ordinal()) + .GenerateDeviceDMatrix(false); HostDeviceVector storage(kRows * kCols); - auto columnar = RandomDataGenerator{kRows, kCols, 0.0} - .Device(0) - .GenerateArrayInterface(&storage); + auto columnar = + RandomDataGenerator{kRows, kCols, 0.0}.Device(ctx.Ordinal()).GenerateArrayInterface(&storage); auto adapter = data::CupyAdapter(columnar); - std::shared_ptr p_full { - DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), 1) - }; - TestTrainingPrediction(kRows, kBins, "gpu_hist", p_full, p_ellpack); + std::shared_ptr p_full{ + DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), 1)}; + TestTrainingPrediction(&ctx, kRows, kBins, p_full, p_ellpack); } TEST(GPUPredictor, ExternalMemoryTest) { @@ -153,9 +153,8 @@ TEST(GPUPredictor, ExternalMemoryTest) { gpu_predictor->Configure({}); const int n_classes = 3; - Context ctx; - ctx.gpu_id = 0; - LearnerModelParam mparam{MakeMP(5, .5, n_classes, ctx.gpu_id)}; + Context ctx = MakeCUDACtx(0); + LearnerModelParam mparam{MakeMP(5, .5, n_classes, ctx.Ordinal())}; gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx, n_classes); std::vector> dmats; @@ -185,7 +184,7 @@ TEST(GPUPredictor, InplacePredictCupy) { auto ctx = MakeCUDACtx(0); size_t constexpr kRows{128}, kCols{64}; RandomDataGenerator gen(kRows, kCols, 0.5); - gen.Device(ctx.gpu_id); + gen.Device(ctx.Ordinal()); HostDeviceVector data; std::string interface_str = gen.GenerateArrayInterface(&data); std::shared_ptr p_fmat{new data::DMatrixProxy}; @@ -197,7 +196,7 @@ TEST(GPUPredictor, InplacePredictCuDF) { auto ctx = MakeCUDACtx(0); size_t constexpr kRows{128}, kCols{64}; RandomDataGenerator gen(kRows, kCols, 0.5); - gen.Device(ctx.gpu_id); + gen.Device(ctx.Ordinal()); std::vector> storage(kCols); auto interface_str = gen.GenerateColumnarArrayInterface(&storage); std::shared_ptr p_fmat{new data::DMatrixProxy}; @@ -214,9 +213,8 @@ TEST(GpuPredictor, LesserFeatures) { TEST(GPUPredictor, ShapStump) { cudaSetDevice(0); - Context ctx; - ctx.gpu_id = 0; - LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.gpu_id)}; + auto ctx = MakeCUDACtx(0); + LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Ordinal())}; gbm::GBTreeModel model(&mparam, &ctx); std::vector> trees; @@ -241,9 +239,8 @@ TEST(GPUPredictor, ShapStump) { } TEST(GPUPredictor, Shap) { - Context ctx; - ctx.gpu_id = 0; - LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.gpu_id)}; + auto ctx = MakeCUDACtx(0); + LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Ordinal())}; gbm::GBTreeModel model(&mparam, &ctx); std::vector> trees; diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc index b85abf183..993504c57 100644 --- a/tests/cpp/predictor/test_predictor.cc +++ b/tests/cpp/predictor/test_predictor.cc @@ -44,60 +44,49 @@ TEST(Predictor, PredictionCache) { EXPECT_ANY_THROW(container.Entry(m)); } -void TestTrainingPrediction(size_t rows, size_t bins, - std::string tree_method, - std::shared_ptr p_full, - std::shared_ptr p_hist) { +void TestTrainingPrediction(Context const *ctx, size_t rows, size_t bins, + std::shared_ptr p_full, std::shared_ptr p_hist) { size_t constexpr kCols = 16; size_t constexpr kClasses = 3; size_t constexpr kIters = 3; std::unique_ptr learner; - auto train = [&](Context const& ctx) { - p_hist->Info().labels.Reshape(rows, 1); - 
auto &h_label = p_hist->Info().labels.Data()->HostVector(); - for (size_t i = 0; i < rows; ++i) { - h_label[i] = i % kClasses; - } + p_hist->Info().labels.Reshape(rows, 1); + auto &h_label = p_hist->Info().labels.Data()->HostVector(); - learner.reset(Learner::Create({})); - learner->SetParam("tree_method", tree_method); - learner->SetParam("objective", "multi:softprob"); - learner->SetParam("num_feature", std::to_string(kCols)); - learner->SetParam("num_class", std::to_string(kClasses)); - learner->SetParam("max_bin", std::to_string(bins)); - ConfigLearnerByCtx(&ctx, learner.get()); - learner->Configure(); + for (size_t i = 0; i < rows; ++i) { + h_label[i] = i % kClasses; + } - for (size_t i = 0; i < kIters; ++i) { - learner->UpdateOneIter(i, p_hist); - } + learner.reset(Learner::Create({})); + learner->SetParams(Args{{"objective", "multi:softprob"}, + {"num_feature", std::to_string(kCols)}, + {"num_class", std::to_string(kClasses)}, + {"max_bin", std::to_string(bins)}, + {"device", ctx->DeviceName()}}); + learner->Configure(); - Json model{Object{}}; - learner->SaveModel(&model); + for (size_t i = 0; i < kIters; ++i) { + learner->UpdateOneIter(i, p_hist); + } - learner.reset(Learner::Create({})); - learner->LoadModel(model); - ConfigLearnerByCtx(&ctx, learner.get()); - learner->Configure(); + Json model{Object{}}; + learner->SaveModel(&model); - HostDeviceVector from_full; - learner->Predict(p_full, false, &from_full, 0, 0); + learner.reset(Learner::Create({})); + learner->LoadModel(model); + learner->SetParam("device", ctx->DeviceName()); + learner->Configure(); - HostDeviceVector from_hist; - learner->Predict(p_hist, false, &from_hist, 0, 0); + HostDeviceVector from_full; + learner->Predict(p_full, false, &from_full, 0, 0); - for (size_t i = 0; i < rows; ++i) { - EXPECT_NEAR(from_hist.ConstHostVector()[i], - from_full.ConstHostVector()[i], kRtEps); - } - }; + HostDeviceVector from_hist; + learner->Predict(p_hist, false, &from_hist, 0, 0); - if (tree_method == "gpu_hist") { - train(MakeCUDACtx(0)); - } else { - train(Context{}); + for (size_t i = 0; i < rows; ++i) { + EXPECT_NEAR(from_hist.ConstHostVector()[i], from_full.ConstHostVector()[i], kRtEps); } } @@ -120,7 +109,7 @@ void TestInplacePrediction(Context const *ctx, std::shared_ptr x, bst_r learner->UpdateOneIter(it, m); } - learner->SetParam("gpu_id", std::to_string(ctx->gpu_id)); + learner->SetParam("device", ctx->DeviceName()); learner->Configure(); HostDeviceVector *p_out_predictions_0{nullptr}; @@ -153,7 +142,7 @@ void TestInplacePrediction(Context const *ctx, std::shared_ptr x, bst_r ASSERT_NEAR(h_pred[i], h_pred_0[i] + h_pred_1[i] - 0.5f, kRtEps); } - learner->SetParam("gpu_id", "-1"); + learner->SetParam("device", "cpu"); learner->Configure(); } @@ -161,12 +150,12 @@ namespace { std::unique_ptr LearnerForTest(Context const *ctx, std::shared_ptr dmat, size_t iters, size_t forest = 1) { std::unique_ptr learner{Learner::Create({dmat})}; - learner->SetParams(Args{{"num_parallel_tree", std::to_string(forest)}}); + learner->SetParams( + Args{{"num_parallel_tree", std::to_string(forest)}, {"device", ctx->DeviceName()}}); for (size_t i = 0; i < iters; ++i) { learner->UpdateOneIter(i, dmat); } - ConfigLearnerByCtx(ctx, learner.get()); return learner; } @@ -215,7 +204,7 @@ void TestPredictionDeviceAccess() { { ASSERT_EQ(from_cpu.DeviceIdx(), Context::kCpuId); Context cpu_ctx; - ConfigLearnerByCtx(&cpu_ctx, learner.get()); + learner->SetParam("device", cpu_ctx.DeviceName()); learner->Predict(m_test, false, &from_cpu, 0, 0); 
ASSERT_TRUE(from_cpu.HostCanWrite()); ASSERT_FALSE(from_cpu.DeviceCanRead()); @@ -225,7 +214,7 @@ void TestPredictionDeviceAccess() { HostDeviceVector from_cuda; { Context cuda_ctx = MakeCUDACtx(0); - ConfigLearnerByCtx(&cuda_ctx, learner.get()); + learner->SetParam("device", cuda_ctx.DeviceName()); learner->Predict(m_test, false, &from_cuda, 0, 0); ASSERT_EQ(from_cuda.DeviceIdx(), 0); ASSERT_TRUE(from_cuda.DeviceCanWrite()); @@ -465,11 +454,7 @@ void TestIterationRangeColumnSplit(Context const* ctx) { auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(true, true, kClasses); auto learner = LearnerForTest(ctx, dmat, kIters, kForest); - if (ctx->IsCPU()) { - learner->SetParams(Args{{"gpu_id", std::to_string(-1)}}); - } else { - learner->SetParams(Args{{"gpu_id", std::to_string(0)}}); - } + learner->SetParam("device", ctx->DeviceName()); bool bound = false; std::unique_ptr sliced{learner->Slice(0, 3, 1, &bound)}; @@ -582,7 +567,7 @@ void TestSparsePredictionColumnSplit(Context const* ctx, float sparsity) { learner.reset(Learner::Create({Xy})); learner->LoadModel(model); - ConfigLearnerByCtx(ctx, learner.get()); + learner->SetParam("device", ctx->DeviceName()); learner->Predict(Xy, false, &sparse_predt, 0, 0); auto constexpr kWorldSize = 2; diff --git a/tests/cpp/predictor/test_predictor.h b/tests/cpp/predictor/test_predictor.h index c6f4d1816..81ec3cb5d 100644 --- a/tests/cpp/predictor/test_predictor.h +++ b/tests/cpp/predictor/test_predictor.h @@ -84,9 +84,8 @@ void TestPredictionFromGradientIndex(Context const* ctx, size_t rows, size_t col } // p_full and p_hist should come from the same data set. -void TestTrainingPrediction(size_t rows, size_t bins, std::string tree_method, - std::shared_ptr p_full, - std::shared_ptr p_hist); +void TestTrainingPrediction(Context const* ctx, size_t rows, size_t bins, + std::shared_ptr p_full, std::shared_ptr p_hist); void TestInplacePrediction(Context const* ctx, std::shared_ptr x, bst_row_t rows, bst_feature_t cols); diff --git a/tests/cpp/test_context.cc b/tests/cpp/test_context.cc new file mode 100644 index 000000000..d49f7b4b2 --- /dev/null +++ b/tests/cpp/test_context.cc @@ -0,0 +1,31 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include +#include +#include + +namespace xgboost { +TEST(Context, CPU) { + Context ctx; + ASSERT_EQ(ctx.Device(), DeviceOrd::CPU()); + ASSERT_EQ(ctx.Ordinal(), Context::kCpuId); + + std::int32_t flag{0}; + ctx.DispatchDevice([&] { flag = -1; }, [&] { flag = 1; }); + ASSERT_EQ(flag, -1); + + ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "oops"}}), dmlc::Error); + ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "-1"}}), dmlc::Error); + ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "CPU"}}), dmlc::Error); + ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "CUDA"}}), dmlc::Error); + ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "CPU:0"}}), dmlc::Error); + ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "gpu:+0"}}), dmlc::Error); + ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "gpu:0-"}}), dmlc::Error); + ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", "gpu:"}}), dmlc::Error); + ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ":"}}), dmlc::Error); + ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ":gpu"}}), dmlc::Error); + ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ":0"}}), dmlc::Error); + ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ""}}), dmlc::Error); +} +} // namespace xgboost diff --git a/tests/cpp/test_context.cu 
b/tests/cpp/test_context.cu
new file mode 100644
index 000000000..035d22125
--- /dev/null
+++ b/tests/cpp/test_context.cu
@@ -0,0 +1,99 @@
+/**
+ * Copyright 2023, XGBoost Contributors
+ */
+#include <gtest/gtest.h>
+#include <xgboost/base.h>     // for Args
+#include <xgboost/context.h>
+#include <xgboost/json.h>     // for FromJson, ToJson
+
+#include <string>  // for string, to_string
+
+#include "../../src/common/common.h"  // for AllVisibleGPUs
+
+namespace xgboost {
+namespace {
+void TestCUDA(Context const& ctx, bst_d_ordinal_t ord) {
+  ASSERT_EQ(ctx.gpu_id, ord);
+  ASSERT_EQ(ctx.Device().ordinal, ord);
+  ASSERT_EQ(ctx.DeviceName(), "cuda:" + std::to_string(ord));
+  ASSERT_EQ(ctx.Ordinal(), ord);
+  ASSERT_TRUE(ctx.IsCUDA());
+  ASSERT_FALSE(ctx.IsCPU());
+  ASSERT_EQ(ctx.Device(), DeviceOrd::CUDA(ord));
+
+  Json jctx{ToJson(ctx)};
+  Context new_ctx;
+  FromJson(jctx, &new_ctx);
+  ASSERT_EQ(new_ctx.Device(), ctx.Device());
+  ASSERT_EQ(new_ctx.gpu_id, ctx.gpu_id);
+}
+}  // namespace
+
+TEST(Context, DeviceOrdinal) {
+  Context ctx;
+  auto n_vis = common::AllVisibleGPUs();
+  auto ord = n_vis - 1;
+
+  std::string device = "cuda:" + std::to_string(ord);
+  ctx.UpdateAllowUnknown(Args{{"device", device}});
+  TestCUDA(ctx, ord);
+
+  device = "cuda:" + std::to_string(1001);
+  ctx.UpdateAllowUnknown(Args{{"device", device}});
+  ord = 1001 % n_vis;
+
+  TestCUDA(ctx, ord);
+
+  std::int32_t flag{0};
+  ctx.DispatchDevice([&] { flag = -1; }, [&] { flag = 1; });
+  ASSERT_EQ(flag, 1);
+
+  Context new_ctx = ctx;
+  TestCUDA(new_ctx, ctx.Ordinal());
+
+  auto cpu_ctx = ctx.MakeCPU();
+  ASSERT_TRUE(cpu_ctx.IsCPU());
+  ASSERT_EQ(cpu_ctx.Ordinal(), Context::kCpuId);
+  ASSERT_EQ(cpu_ctx.Device(), DeviceOrd::CPU());
+
+  auto cuda_ctx = cpu_ctx.MakeCUDA(ctx.Ordinal());
+  TestCUDA(cuda_ctx, ctx.Ordinal());
+
+  cuda_ctx.UpdateAllowUnknown(Args{{"fail_on_invalid_gpu_id", "true"}});
+  ASSERT_THROW({ cuda_ctx.UpdateAllowUnknown(Args{{"device", "cuda:9999"}}); }, dmlc::Error);
+  cuda_ctx.UpdateAllowUnknown(Args{{"device", "cuda:00"}});
+  ASSERT_EQ(cuda_ctx.Ordinal(), 0);
+
+  ctx.UpdateAllowUnknown(Args{{"device", "cpu"}});
+  // Test alias
+  ctx.UpdateAllowUnknown(Args{{"device", "gpu:0"}});
+  TestCUDA(ctx, 0);
+  ctx.UpdateAllowUnknown(Args{{"device", "gpu"}});
+  TestCUDA(ctx, 0);
+
+  // Test the thread local memory in dmlc is not linking different instances together.
+ cpu_ctx.UpdateAllowUnknown(Args{{"device", "cpu"}}); + TestCUDA(ctx, 0); + ctx.UpdateAllowUnknown(Args{}); + TestCUDA(ctx, 0); +} + +TEST(Context, GPUId) { + Context ctx; + ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}}); + TestCUDA(ctx, 0); + + auto n_vis = common::AllVisibleGPUs(); + auto ord = n_vis - 1; + ctx.UpdateAllowUnknown(Args{{"gpu_id", std::to_string(ord)}}); + TestCUDA(ctx, ord); + + auto device = "cuda:" + std::to_string(1001); + ctx.UpdateAllowUnknown(Args{{"device", device}}); + ord = 1001 % n_vis; + TestCUDA(ctx, ord); + + ctx.UpdateAllowUnknown(Args{{"gpu_id", "-1"}}); + ASSERT_EQ(ctx.Device(), DeviceOrd::CPU()); +} +} // namespace xgboost diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc index 0981fc352..2165c6c8d 100644 --- a/tests/cpp/test_learner.cc +++ b/tests/cpp/test_learner.cc @@ -27,7 +27,6 @@ #include "../../src/common/io.h" // for LoadSequentialFile #include "../../src/common/linalg_op.h" // for ElementWiseTransformHost, begin, end #include "../../src/common/random.h" // for GlobalRandom -#include "../../src/common/transform_iterator.h" // for IndexTransformIter #include "dmlc/io.h" // for Stream #include "dmlc/omp.h" // for omp_get_max_threads #include "dmlc/registry.h" // for Registry @@ -35,14 +34,13 @@ #include "helpers.h" // for GetBaseScore, RandomDataGenerator #include "objective_helpers.h" // for MakeObjNamesForTest, ObjTestNameGenerator #include "xgboost/base.h" // for bst_float, Args, bst_feature_t, bst_int -#include "xgboost/context.h" // for Context +#include "xgboost/context.h" // for Context, DeviceOrd #include "xgboost/data.h" // for DMatrix, MetaInfo, DataType #include "xgboost/host_device_vector.h" // for HostDeviceVector #include "xgboost/json.h" // for Json, Object, get, String, IsA, opera... 
#include "xgboost/linalg.h" // for Tensor, TensorView #include "xgboost/logging.h" // for ConsoleLogger #include "xgboost/predictor.h" // for PredictionCacheEntry -#include "xgboost/span.h" // for Span, operator!=, SpanIterator #include "xgboost/string_view.h" // for StringView namespace xgboost { @@ -58,9 +56,9 @@ TEST(Learner, Basic) { auto minor = XGBOOST_VER_MINOR; auto patch = XGBOOST_VER_PATCH; - static_assert(std::is_integral::value, "Wrong major version type"); - static_assert(std::is_integral::value, "Wrong minor version type"); - static_assert(std::is_integral::value, "Wrong patch version type"); + static_assert(std::is_integral_v, "Wrong major version type"); + static_assert(std::is_integral_v, "Wrong minor version type"); + static_assert(std::is_integral_v, "Wrong patch version type"); } TEST(Learner, ParameterValidation) { @@ -92,8 +90,7 @@ TEST(Learner, CheckGroup) { size_t constexpr kNumRows = 17; bst_feature_t constexpr kNumCols = 15; - std::shared_ptr p_mat{ - RandomDataGenerator{kNumRows, kNumCols, 0.0f}.GenerateDMatrix()}; + std::shared_ptr p_mat{RandomDataGenerator{kNumRows, kNumCols, 0.0f}.GenerateDMatrix()}; std::vector weight(kNumGroups, 1); std::vector group(kNumGroups); group[0] = 2; @@ -312,35 +309,36 @@ TEST(Learner, GPUConfiguration) { learner->SetParams({Arg{"booster", "gblinear"}, Arg{"updater", "gpu_coord_descent"}}); learner->UpdateOneIter(0, p_dmat); - ASSERT_EQ(learner->Ctx()->gpu_id, 0); + ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0)); } { - std::unique_ptr learner {Learner::Create(mat)}; + std::unique_ptr learner{Learner::Create(mat)}; learner->SetParams({Arg{"tree_method", "gpu_hist"}}); + learner->Configure(); + ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0)); learner->UpdateOneIter(0, p_dmat); - ASSERT_EQ(learner->Ctx()->gpu_id, 0); + ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0)); } { std::unique_ptr learner {Learner::Create(mat)}; learner->SetParams({Arg{"tree_method", "gpu_hist"}, Arg{"gpu_id", "-1"}}); learner->UpdateOneIter(0, p_dmat); - ASSERT_EQ(learner->Ctx()->gpu_id, 0); + ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0)); } { // with CPU algorithm std::unique_ptr learner {Learner::Create(mat)}; learner->SetParams({Arg{"tree_method", "hist"}}); learner->UpdateOneIter(0, p_dmat); - ASSERT_EQ(learner->Ctx()->gpu_id, -1); + ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CPU()); } { // with CPU algorithm, but `gpu_id` takes priority std::unique_ptr learner {Learner::Create(mat)}; - learner->SetParams({Arg{"tree_method", "hist"}, - Arg{"gpu_id", "0"}}); + learner->SetParams({Arg{"tree_method", "hist"}, Arg{"gpu_id", "0"}}); learner->UpdateOneIter(0, p_dmat); - ASSERT_EQ(learner->Ctx()->gpu_id, 0); + ASSERT_EQ(learner->Ctx()->Device(), DeviceOrd::CUDA(0)); } } #endif // defined(XGBOOST_USE_CUDA) diff --git a/tests/cpp/tree/test_node_partition.cc b/tests/cpp/tree/test_node_partition.cc index d7254fa60..abde2da70 100644 --- a/tests/cpp/tree/test_node_partition.cc +++ b/tests/cpp/tree/test_node_partition.cc @@ -6,7 +6,9 @@ #include // for ObjInfo #include // for TreeUpdater -#include // for unique_ptr +#include // for unique_ptr + +#include "../helpers.h" namespace xgboost { TEST(Updater, HasNodePosition) { @@ -19,7 +21,7 @@ TEST(Updater, HasNodePosition) { ASSERT_TRUE(up->HasNodePosition()); #if defined(XGBOOST_USE_CUDA) - ctx.gpu_id = 0; + ctx = MakeCUDACtx(0); up.reset(TreeUpdater::Create("grow_gpu_hist", &ctx, &task)); ASSERT_TRUE(up->HasNodePosition()); #endif // defined(XGBOOST_USE_CUDA) diff --git 
a/tests/cpp/tree/test_prediction_cache.cc b/tests/cpp/tree/test_prediction_cache.cc index 1877b7a35..e60d9cd8a 100644 --- a/tests/cpp/tree/test_prediction_cache.cc +++ b/tests/cpp/tree/test_prediction_cache.cc @@ -70,9 +70,9 @@ class TestPredictionCache : public ::testing::Test { Context ctx; ctx.InitAllowUnknown(Args{{"nthread", "8"}}); if (updater_name == "grow_gpu_hist") { - ctx.gpu_id = 0; + ctx = ctx.MakeCUDA(0); } else { - ctx.gpu_id = Context::kCpuId; + ctx = ctx.MakeCPU(); } ObjInfo task{ObjInfo::kRegression}; diff --git a/tests/python-gpu/load_pickle.py b/tests/python-gpu/load_pickle.py index caefa362d..a63dd28aa 100644 --- a/tests/python-gpu/load_pickle.py +++ b/tests/python-gpu/load_pickle.py @@ -34,7 +34,7 @@ class TestLoadPickle: bst = load_pickle(model_path) config = bst.save_config() config = json.loads(config) - assert config["learner"]["generic_param"]["gpu_id"] == "-1" + assert config["learner"]["generic_param"]["device"] == "cpu" def test_context_is_preserved(self) -> None: """Test the device context is preserved after pickling.""" @@ -42,14 +42,14 @@ class TestLoadPickle: bst = load_pickle(model_path) config = bst.save_config() config = json.loads(config) - assert config["learner"]["generic_param"]["gpu_id"] == "0" + assert config["learner"]["generic_param"]["device"] == "cuda:0" def test_wrap_gpu_id(self) -> None: assert os.environ["CUDA_VISIBLE_DEVICES"] == "0" bst = load_pickle(model_path) config = bst.save_config() config = json.loads(config) - assert config["learner"]["generic_param"]["gpu_id"] == "0" + assert config["learner"]["generic_param"]["device"] == "cuda:0" x, y = build_dataset() test_x = xgb.DMatrix(x) diff --git a/tests/python-gpu/test_device_quantile_dmatrix.py b/tests/python-gpu/test_device_quantile_dmatrix.py index 477e9f2a1..ace17933b 100644 --- a/tests/python-gpu/test_device_quantile_dmatrix.py +++ b/tests/python-gpu/test_device_quantile_dmatrix.py @@ -203,7 +203,7 @@ class TestQuantileDMatrix: np.testing.assert_equal(h_ret.indices, d_ret.indices) booster = xgb.train( - {"tree_method": "gpu_hist", "gpu_id": "0"}, dtrain=d_m + {"tree_method": "hist", "device": "cuda:0"}, dtrain=d_m ) np.testing.assert_allclose( diff --git a/tests/python-gpu/test_gpu_basic_models.py b/tests/python-gpu/test_gpu_basic_models.py index a6f50c224..e97ca210e 100644 --- a/tests/python-gpu/test_gpu_basic_models.py +++ b/tests/python-gpu/test_gpu_basic_models.py @@ -65,16 +65,20 @@ class TestGPUBasicModels: @pytest.mark.skipif(**tm.no_sklearn()) def test_invalid_gpu_id(self): from sklearn.datasets import load_digits + X, y = load_digits(return_X_y=True) # should pass with invalid gpu id - cls1 = xgb.XGBClassifier(tree_method='gpu_hist', gpu_id=9999) + cls1 = xgb.XGBClassifier(tree_method="gpu_hist", gpu_id=9999) cls1.fit(X, y) # should throw error with fail_on_invalid_gpu_id enabled cls2 = xgb.XGBClassifier( - tree_method='gpu_hist', gpu_id=9999, fail_on_invalid_gpu_id=True + tree_method="gpu_hist", gpu_id=9999, fail_on_invalid_gpu_id=True ) - try: + with pytest.raises(ValueError, match="ordinal 9999 is invalid"): + cls2.fit(X, y) + + cls2 = xgb.XGBClassifier( + tree_method="hist", device="cuda:9999", fail_on_invalid_gpu_id=True + ) + with pytest.raises(ValueError, match="ordinal 9999 is invalid"): cls2.fit(X, y) - assert False, "Should have failed with with fail_on_invalid_gpu_id enabled" - except xgb.core.XGBoostError as err: - assert "gpu_id 9999 is invalid" in str(err) diff --git a/tests/python-gpu/test_gpu_eval_metrics.py b/tests/python-gpu/test_gpu_eval_metrics.py index 
f5f770d2f..f084eaa45 100644 --- a/tests/python-gpu/test_gpu_eval_metrics.py +++ b/tests/python-gpu/test_gpu_eval_metrics.py @@ -43,10 +43,16 @@ class TestGPUEvalMetrics: num_boost_round=10, ) cpu_auc = float(booster.eval(Xy).split(":")[1]) - booster.set_param({"gpu_id": "0"}) - assert json.loads(booster.save_config())["learner"]["generic_param"]["gpu_id"] == "0" + booster.set_param({"device": "cuda:0"}) + assert ( + json.loads(booster.save_config())["learner"]["generic_param"]["device"] + == "cuda:0" + ) gpu_auc = float(booster.eval(Xy).split(":")[1]) - assert json.loads(booster.save_config())["learner"]["generic_param"]["gpu_id"] == "0" + assert ( + json.loads(booster.save_config())["learner"]["generic_param"]["device"] + == "cuda:0" + ) np.testing.assert_allclose(cpu_auc, gpu_auc) diff --git a/tests/python-gpu/test_gpu_pickling.py b/tests/python-gpu/test_gpu_pickling.py index 49ac24740..10c4c7e45 100644 --- a/tests/python-gpu/test_gpu_pickling.py +++ b/tests/python-gpu/test_gpu_pickling.py @@ -113,14 +113,6 @@ class TestPickling: param = {"tree_method": "gpu_hist", "verbosity": 1} bst = xgb.train(param, train_x) - with tm.captured_output() as (out, err): - bst.inplace_predict(x) - - # The warning is redirected to Python callback, so it's printed in stdout - # instead of stderr. - stdout = out.getvalue() - assert stdout.find("mismatched devices") != -1 - save_pickle(bst, model_path) args = self.args_template.copy() @@ -177,7 +169,7 @@ class TestPickling: # Switch to CPU predictor bst = model.get_booster() - tm.set_ordinal(-1, bst) + bst.set_param({"device": "cpu"}) cpu_pred = model.predict(x, output_margin=True) np.testing.assert_allclose(cpu_pred, gpu_pred, rtol=1e-5) diff --git a/tests/python-gpu/test_gpu_prediction.py b/tests/python-gpu/test_gpu_prediction.py index 0d961d0e3..fb5f47c2b 100644 --- a/tests/python-gpu/test_gpu_prediction.py +++ b/tests/python-gpu/test_gpu_prediction.py @@ -39,7 +39,8 @@ predict_parameter_strategy = strategies.fixed_dictionaries( } ) -pytestmark = tm.timeout(20) +# cupy nvrtc compilation can take a long time for the first run +pytestmark = tm.timeout(30) class TestGPUPredict: @@ -71,8 +72,8 @@ class TestGPUPredict: param = { "objective": "binary:logistic", "eval_metric": "logloss", - "tree_method": "gpu_hist", - "gpu_id": 0, + "tree_method": "hist", + "device": "gpu:0", "max_depth": 1, } bst = xgb.train( @@ -84,7 +85,7 @@ class TestGPUPredict: gpu_pred_test = bst.predict(dtest, output_margin=True) gpu_pred_val = bst.predict(dval, output_margin=True) - bst.set_param({"gpu_id": -1, "tree_method": "hist"}) + bst.set_param({"device": "cpu", "tree_method": "hist"}) bst_cpu = copy(bst) cpu_pred_train = bst_cpu.predict(dtrain, output_margin=True) cpu_pred_test = bst_cpu.predict(dtest, output_margin=True) @@ -107,14 +108,15 @@ class TestGPUPredict: dtrain = xgb.DMatrix(X_train, label=y_train) params = {} - params["tree_method"] = "gpu_hist" + params["tree_method"] = "hist" + params["device"] = "cuda:0" bst = xgb.train(params, dtrain) - tm.set_ordinal(0, bst) + bst.set_param({"device": "cuda:0"}) # Don't reuse the DMatrix for prediction, otherwise the result is cached. 
        predict_gpu_0 = bst.predict(xgb.DMatrix(X_test))
         predict_gpu_1 = bst.predict(xgb.DMatrix(X_test))
-        tm.set_ordinal(-1, bst)
+        bst.set_param({"device": "cpu"})
         predict_cpu = bst.predict(xgb.DMatrix(X_test))
         assert np.allclose(predict_gpu_0, predict_gpu_1)
@@ -131,8 +133,8 @@ class TestGPUPredict:
         X_test, y_test = X[tr_size:, :], y[tr_size:]
 
         params = {
-            "tree_method": "gpu_hist",
-            "gpu_id": "0",
+            "tree_method": "hist",
+            "device": "cuda:0",
             "n_jobs": -1,
             "seed": 123,
         }
@@ -141,13 +143,54 @@ class TestGPUPredict:
         gpu_test_score = m.score(X_test, y_test)
 
         # Now with cpu
-        m = tm.set_ordinal(-1, m)
+        m.set_params(device="cpu")
         cpu_train_score = m.score(X_train, y_train)
         cpu_test_score = m.score(X_test, y_test)
 
         assert np.allclose(cpu_train_score, gpu_train_score)
         assert np.allclose(cpu_test_score, gpu_test_score)
 
+    @pytest.mark.parametrize("device", ["cpu", "cuda"])
+    @pytest.mark.skipif(**tm.no_cupy())
+    def test_inplace_predict_device_type(self, device: str) -> None:
+        """Test inplace predict with different device and data types.
+
+        The sklearn interface uses inplace predict by default, and gbtree falls back
+        to DMatrix whenever the device doesn't match. This test checks that XGBoost
+        can handle different combinations of device and input data type.
+
+        """
+        import cudf
+        import cupy as cp
+        import pandas as pd
+        from scipy.sparse import csr_matrix
+
+        reg = xgb.XGBRegressor(tree_method="hist", device=device)
+        n_samples = 4096
+        n_features = 13
+        X, y, w = tm.make_regression(n_samples, n_features, use_cupy=True)
+        X[X == 0.0] = 1.0
+
+        reg.fit(X, y, sample_weight=w)
+        predt_0 = reg.predict(X)
+
+        X = cp.asnumpy(X)
+        predt_1 = reg.predict(X)
+
+        df = pd.DataFrame(X)
+        predt_2 = reg.predict(df)
+
+        df = cudf.DataFrame(X)
+        predt_3 = reg.predict(df)
+
+        X_csr = csr_matrix(X)
+        predt_4 = reg.predict(X_csr)
+
+        np.testing.assert_allclose(predt_0, predt_1)
+        np.testing.assert_allclose(predt_0, predt_2)
+        np.testing.assert_allclose(predt_0, predt_3)
+        np.testing.assert_allclose(predt_0, predt_4)
+
     def run_inplace_base_margin(self, booster, dtrain, X, base_margin):
         import cupy as cp
 
@@ -175,7 +218,9 @@ class TestGPUPredict:
         dtrain = xgb.DMatrix(X, y)
 
         booster = xgb.train(
-            {"tree_method": "gpu_hist", "gpu_id": device}, dtrain, num_boost_round=10
+            {"tree_method": "hist", "device": f"cuda:{device}"},
+            dtrain,
+            num_boost_round=10,
         )
 
         test = xgb.DMatrix(X[:10, ...], missing=missing)
@@ -208,13 +253,13 @@ class TestGPUPredict:
         missing_idx = [i for i in range(0, X.shape[1], 16)]
         X[:, missing_idx] = missing
         reg = xgb.XGBRegressor(
-            tree_method="gpu_hist", n_estimators=8, missing=missing, gpu_id=device
+            tree_method="hist", n_estimators=8, missing=missing, device=f"cuda:{device}"
        )
         reg.fit(X, y)
 
-        reg = tm.set_ordinal(device, reg)
+        reg.set_params(device=f"cuda:{device}")
         gpu_predt = reg.predict(X)
-        reg = tm.set_ordinal(-1, reg)
+        reg = reg.set_params(device="cpu")
         cpu_predt = reg.predict(cp.asnumpy(X))
         np.testing.assert_allclose(gpu_predt, cpu_predt, atol=1e-6)
         cp.cuda.runtime.setDevice(0)
@@ -250,7 +295,9 @@ class TestGPUPredict:
 
         dtrain = xgb.DMatrix(X, y)
 
-        booster = xgb.train({"tree_method": "gpu_hist"}, dtrain, num_boost_round=10)
+        booster = xgb.train(
+            {"tree_method": "hist", "device": "cuda:0"}, dtrain, num_boost_round=10
+        )
         test = xgb.DMatrix(X)
         predt_from_array = booster.inplace_predict(X)
         predt_from_dmatrix = booster.predict(test)
@@ -280,12 +327,12 @@ class TestGPUPredict:
     def test_shap(self, num_rounds, dataset, param):
         if dataset.name.endswith("-l1"):  # not supported by the exact tree
method return - param.update({"tree_method": "gpu_hist", "gpu_id": 0}) + param.update({"tree_method": "hist", "device": "gpu:0"}) param = dataset.set_params(param) dmat = dataset.get_dmat() bst = xgb.train(param, dmat, num_rounds) test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w, dataset.margin) - bst = tm.set_ordinal(0, bst) + bst.set_param({"device": "gpu:0"}) shap = bst.predict(test_dmat, pred_contribs=True) margin = bst.predict(test_dmat, output_margin=True) assume(len(dataset.y) > 0) @@ -298,12 +345,12 @@ class TestGPUPredict: def test_shap_interactions(self, num_rounds, dataset, param): if dataset.name.endswith("-l1"): # not supported by the exact tree method return - param.update({"tree_method": "hist", "gpu_id": 0}) + param.update({"tree_method": "hist", "device": "cuda:0"}) param = dataset.set_params(param) dmat = dataset.get_dmat() bst = xgb.train(param, dmat, num_rounds) test_dmat = xgb.DMatrix(dataset.X, dataset.y, dataset.w, dataset.margin) - bst = tm.set_ordinal(0, bst) + bst.set_param({"device": "cuda:0"}) shap = bst.predict(test_dmat, pred_interactions=True) margin = bst.predict(test_dmat, output_margin=True) assume(len(dataset.y) > 0) @@ -317,16 +364,18 @@ class TestGPUPredict: def test_shap_categorical(self): X, y = tm.make_categorical(100, 20, 7, False) Xy = xgb.DMatrix(X, y, enable_categorical=True) - booster = xgb.train({"tree_method": "gpu_hist"}, Xy, num_boost_round=10) + booster = xgb.train( + {"tree_method": "hist", "device": "gpu:0"}, Xy, num_boost_round=10 + ) - booster = tm.set_ordinal(0, booster) + booster.set_param({"device": "cuda:0"}) shap = booster.predict(Xy, pred_contribs=True) margin = booster.predict(Xy, output_margin=True) np.testing.assert_allclose( np.sum(shap, axis=len(shap.shape) - 1), margin, rtol=1e-3 ) - booster = tm.set_ordinal(-1, booster) + booster.set_param({"device": "cpu"}) shap = booster.predict(Xy, pred_contribs=True) margin = booster.predict(Xy, output_margin=True) np.testing.assert_allclose( @@ -334,8 +383,8 @@ class TestGPUPredict: ) def test_predict_leaf_basic(self): - gpu_leaf = run_predict_leaf(0) - cpu_leaf = run_predict_leaf(-1) + gpu_leaf = run_predict_leaf("gpu:0") + cpu_leaf = run_predict_leaf("cpu") np.testing.assert_equal(gpu_leaf, cpu_leaf) def run_predict_leaf_booster(self, param, num_rounds, dataset): @@ -344,23 +393,22 @@ class TestGPUPredict: booster = xgb.train( param, dtrain=dataset.get_dmat(), num_boost_round=num_rounds ) - booster = tm.set_ordinal(-1, booster) + booster.set_param({"device": "cpu"}) cpu_leaf = booster.predict(m, pred_leaf=True) - booster = tm.set_ordinal(0, booster) + booster.set_param({"device": "cuda:0"}) gpu_leaf = booster.predict(m, pred_leaf=True) np.testing.assert_equal(cpu_leaf, gpu_leaf) @given(predict_parameter_strategy, tm.make_dataset_strategy()) @settings(deadline=None, max_examples=20, print_blob=True) - def test_predict_leaf_gbtree(self, param, dataset): + def test_predict_leaf_gbtree(self, param: dict, dataset: tm.TestDataset) -> None: # Unsupported for random forest if param.get("num_parallel_tree", 1) > 1 and dataset.name.endswith("-l1"): return - param["booster"] = "gbtree" - param["tree_method"] = "gpu_hist" + param.update({"booster": "gbtree", "tree_method": "hist", "device": "cuda:0"}) self.run_predict_leaf_booster(param, 10, dataset) @given(predict_parameter_strategy, tm.make_dataset_strategy()) @@ -370,8 +418,7 @@ class TestGPUPredict: if param.get("num_parallel_tree", 1) > 1 and dataset.name.endswith("-l1"): return - param["booster"] = "dart" - param["tree_method"] = 
"gpu_hist" + param.update({"booster": "dart", "tree_method": "hist", "device": "cuda:0"}) self.run_predict_leaf_booster(param, 10, dataset) @pytest.mark.skipif(**tm.no_sklearn()) @@ -395,12 +442,12 @@ class TestGPUPredict: dtrain = xgb.DMatrix(df, label=y, enable_categorical=True) params = { - "tree_method": "gpu_hist", + "tree_method": "hist", "max_depth": 3, "learning_rate": 1.0, "base_score": 0.0, "eval_metric": "rmse", - "gpu_id": "0", + "device": "cuda:0", } eval_history = {} @@ -412,7 +459,7 @@ class TestGPUPredict: verbose_eval=False, evals_result=eval_history, ) - bst = tm.set_ordinal(0, bst) + bst.set_param({"device": "cuda:0"}) pred = bst.predict(dtrain) rmse = mean_squared_error(y_true=y, y_pred=pred, squared=False) np.testing.assert_almost_equal( @@ -434,14 +481,16 @@ class TestGPUPredict: Xy = xgb.DMatrix(X, y) if n_classes == 2: params = { - "tree_method": "gpu_hist", + "tree_method": "hist", + "device": "cuda:0", "booster": "dart", "rate_drop": 0.5, "objective": "binary:logistic", } else: params = { - "tree_method": "gpu_hist", + "tree_method": "hist", + "device": "cuda:0", "booster": "dart", "rate_drop": 0.5, "objective": "multi:softprob", @@ -455,7 +504,7 @@ class TestGPUPredict: copied = booster.predict(Xy) # CPU - booster = tm.set_ordinal(-1, booster) + booster.set_param({"device": "cpu"}) cpu_inplace = booster.inplace_predict(X_) cpu_copied = booster.predict(Xy) @@ -465,7 +514,7 @@ class TestGPUPredict: cp.testing.assert_allclose(inplace, copied, atol=1e-6) # GPU - booster = tm.set_ordinal(0, booster) + booster.set_param({"device": "cuda:0"}) inplace = booster.inplace_predict(X) copied = booster.predict(Xy) @@ -482,7 +531,7 @@ class TestGPUPredict: orig = rng.randint(low=0, high=127, size=rows * cols).reshape(rows, cols) y = rng.randint(low=0, high=127, size=rows) dtrain = xgb.DMatrix(orig, label=y) - booster = xgb.train({"tree_method": "gpu_hist"}, dtrain) + booster = xgb.train({"tree_method": "hist", "device": "cuda:0"}, dtrain) predt_orig = booster.inplace_predict(orig) # all primitive types in numpy diff --git a/tests/python/test_predict.py b/tests/python/test_predict.py index 15288f53e..04a7d70cb 100644 --- a/tests/python/test_predict.py +++ b/tests/python/test_predict.py @@ -28,7 +28,7 @@ def run_threaded_predict(X, rows, predict_func): assert f.result() -def run_predict_leaf(gpu_id: int) -> np.ndarray: +def run_predict_leaf(device: str) -> np.ndarray: rows = 100 cols = 4 classes = 5 @@ -48,7 +48,7 @@ def run_predict_leaf(gpu_id: int) -> np.ndarray: num_boost_round=num_boost_round, ) - booster = tm.set_ordinal(gpu_id, booster) + booster.set_param({"device": device}) empty = xgb.DMatrix(np.ones(shape=(0, cols))) empty_leaf = booster.predict(empty, pred_leaf=True) assert empty_leaf.shape[0] == 0 @@ -74,14 +74,14 @@ def run_predict_leaf(gpu_id: int) -> np.ndarray: # When there's only 1 tree, the output is a 1 dim vector booster = xgb.train({"tree_method": "hist"}, num_boost_round=1, dtrain=m) - booster = tm.set_ordinal(gpu_id, booster) + booster.set_param({"device": device}) assert booster.predict(m, pred_leaf=True).shape == (rows,) return leaf def test_predict_leaf() -> None: - run_predict_leaf(-1) + run_predict_leaf("cpu") def test_predict_shape(): diff --git a/tests/test_distributed/test_with_spark/test_data.py b/tests/test_distributed/test_with_spark/test_data.py index b08fcdf1d..7f8f1a13e 100644 --- a/tests/test_distributed/test_with_spark/test_data.py +++ b/tests/test_distributed/test_with_spark/test_data.py @@ -69,7 +69,7 @@ def 
run_dmatrix_ctor(is_feature_cols: bool, is_qdm: bool, on_gpu: bool) -> None: train_Xy, valid_Xy = create_dmatrix_from_partitions( iter(dfs), feature_cols, - gpu_id=device_id, + dev_ordinal=device_id, use_qdm=is_qdm, kwargs=kwargs, enable_sparse_data_optim=False, diff --git a/tests/test_distributed/test_with_spark/test_spark_local.py b/tests/test_distributed/test_with_spark/test_spark_local.py index 6d88323ac..dfdadb2ef 100644 --- a/tests/test_distributed/test_with_spark/test_spark_local.py +++ b/tests/test_distributed/test_with_spark/test_spark_local.py @@ -1025,6 +1025,7 @@ class XgboostLocalTest(SparkTestCase): self.assertTrue(hasattr(py_reg, "n_estimators")) self.assertEqual(py_reg.n_estimators.parent, py_reg.uid) self.assertFalse(hasattr(py_reg, "gpu_id")) + self.assertFalse(hasattr(py_reg, "device")) self.assertEqual(py_reg.getOrDefault(py_reg.n_estimators), 100) self.assertEqual(py_reg.getOrDefault(py_reg.objective), "reg:squarederror") py_reg2 = SparkXGBRegressor(n_estimators=200) @@ -1038,6 +1039,7 @@ class XgboostLocalTest(SparkTestCase): self.assertTrue(hasattr(py_cls, "n_estimators")) self.assertEqual(py_cls.n_estimators.parent, py_cls.uid) self.assertFalse(hasattr(py_cls, "gpu_id")) + self.assertFalse(hasattr(py_cls, "device")) self.assertEqual(py_cls.getOrDefault(py_cls.n_estimators), 100) self.assertEqual(py_cls.getOrDefault(py_cls.objective), None) py_cls2 = SparkXGBClassifier(n_estimators=200) @@ -1051,6 +1053,7 @@ class XgboostLocalTest(SparkTestCase): self.assertTrue(hasattr(py_cls, "n_estimators")) self.assertEqual(py_cls.n_estimators.parent, py_cls.uid) self.assertFalse(hasattr(py_cls, "gpu_id")) + self.assertFalse(hasattr(py_cls, "device")) self.assertTrue(hasattr(py_cls, "arbitrary_params_dict")) expected_kwargs = {"sketch_eps": 0.03} self.assertEqual( From 9da50506439be367dd0abec72f9057a7bfba0b75 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 15 Jul 2023 07:46:43 +0800 Subject: [PATCH 032/136] Turn warning messages into Python warnings. 
(#9387) --- python-package/xgboost/core.py | 6 +++++- tests/python-gpu/load_pickle.py | 6 ++---- tests/python/test_with_sklearn.py | 20 +++++++++----------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index d6214c7a6..0250dd293 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -153,7 +153,11 @@ def _expect(expectations: Sequence[Type], got: Type) -> str: def _log_callback(msg: bytes) -> None: """Redirect logs from native library into Python console""" - print(py_str(msg)) + smsg = py_str(msg) + if smsg.find("WARNING:") != -1: + warnings.warn(smsg, UserWarning) + return + print(smsg) def _get_log_callback_func() -> Callable: diff --git a/tests/python-gpu/load_pickle.py b/tests/python-gpu/load_pickle.py index a63dd28aa..2f582e535 100644 --- a/tests/python-gpu/load_pickle.py +++ b/tests/python-gpu/load_pickle.py @@ -61,9 +61,7 @@ class TestLoadPickle: rng = np.random.RandomState(1994) X = rng.randn(10, 10) y = rng.randn(10) - with tm.captured_output() as (out, err): + with pytest.warns(UserWarning, match="No visible GPU is found"): # Test no thrust exception is thrown - with pytest.raises(xgb.core.XGBoostError): + with pytest.raises(xgb.core.XGBoostError, match="have at least one device"): xgb.train({"tree_method": "gpu_hist"}, xgb.DMatrix(X, y)) - - assert out.getvalue().find("No visible GPU is found") != -1 diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index b4550dab2..b961db2c4 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -3,6 +3,7 @@ import os import pickle import random import tempfile +import warnings from typing import Callable, Optional import numpy as np @@ -1091,25 +1092,22 @@ def test_constraint_parameters(): ) +@pytest.mark.filterwarnings("error") def test_parameter_validation(): - reg = xgb.XGBRegressor(foo='bar', verbosity=1) + reg = xgb.XGBRegressor(foo="bar", verbosity=1) X = np.random.randn(10, 10) y = np.random.randn(10) - with tm.captured_output() as (out, err): + with pytest.warns(Warning, match="foo"): reg.fit(X, y) - output = out.getvalue().strip() - assert output.find('foo') != -1 - - reg = xgb.XGBRegressor(n_estimators=2, missing=3, - importance_type='gain', verbosity=1) + reg = xgb.XGBRegressor( + n_estimators=2, missing=3, importance_type="gain", verbosity=1 + ) X = np.random.randn(10, 10) y = np.random.randn(10) - with tm.captured_output() as (out, err): - reg.fit(X, y) - output = out.getvalue().strip() - assert len(output) == 0 + with warnings.catch_warnings(): + reg.fit(X, y) def test_deprecate_position_arg(): From 16eb41936d6edfa216f78029ed2980f391a97adc Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 15 Jul 2023 19:11:20 +0800 Subject: [PATCH 033/136] Handle the new `device` parameter in dask and demos. (#9386) * Handle the new `device` parameter in dask and demos. - Check no ordinal is specified in the dask interface. - Update demos. - Update dask doc. - Update the condition for QDM. 
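One commit-message bullet above deserves a concrete illustration: in the dask interface, `device` is checked so that no ordinal is specified, since each worker binds its own GPU. A sketch of the intended usage (assumes `dask_cuda` is installed and at least one GPU is visible):

    from dask import array as da
    from dask.distributed import Client
    from dask_cuda import LocalCUDACluster

    import xgboost as xgb

    with LocalCUDACluster(n_workers=1) as cluster, Client(cluster) as client:
        X = da.random.random(size=(10000, 10), chunks=1000)
        y = da.random.random(size=(10000,), chunks=1000)
        dtrain = xgb.dask.DaskDMatrix(client, X, y)
        # Pass "cuda" without an ordinal; the cluster assigns one GPU
        # per worker process.
        output = xgb.dask.train(
            client,
            {"tree_method": "hist", "device": "cuda"},
            dtrain,
            num_boost_round=4,
        )
        booster = output["booster"]

The demo updates below follow the same pattern.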
--- demo/dask/cpu_survival.py | 52 +- demo/dask/cpu_training.py | 22 +- demo/dask/gpu_training.py | 59 +- demo/dask/sklearn_gpu_training.py | 9 +- demo/guide-python/callbacks.py | 3 +- demo/guide-python/cat_in_the_dat.py | 3 +- demo/guide-python/categorical.py | 4 +- demo/guide-python/external_memory.py | 5 +- demo/guide-python/learning_to_rank.py | 6 +- demo/guide-python/quantile_data_iterator.py | 35 +- demo/guide-python/update_process.py | 8 +- doc/gpu/index.rst | 31 +- doc/parameter.rst | 8 +- doc/python/python_intro.rst | 4 +- doc/tutorials/dask.rst | 11 +- include/xgboost/c_api.h | 2 +- python-package/xgboost/core.py | 2 +- python-package/xgboost/dask.py | 20 +- python-package/xgboost/sklearn.py | 6 +- python-package/xgboost/spark/core.py | 8 +- src/c_api/c_api.cc | 15 +- src/common/error_msg.cc | 13 +- src/common/error_msg.h | 3 + src/learner.cc | 15 +- tests/ci_build/lint_python.py | 3 + tests/python/test_quantile_dmatrix.py | 12 + tests/python/test_updaters.py | 22 +- tests/python/test_with_sklearn.py | 5 +- .../test_gpu_with_dask/test_gpu_with_dask.py | 68 +- .../test_with_dask/test_with_dask.py | 623 ++++++++++-------- .../test_with_spark/test_spark_local.py | 4 +- 31 files changed, 631 insertions(+), 450 deletions(-) diff --git a/demo/dask/cpu_survival.py b/demo/dask/cpu_survival.py index 83eddd361..7fe0570de 100644 --- a/demo/dask/cpu_survival.py +++ b/demo/dask/cpu_survival.py @@ -18,43 +18,45 @@ def main(client): # The Veterans' Administration Lung Cancer Trial # The Statistical Analysis of Failure Time Data by Kalbfleisch J. and Prentice R (1980) CURRENT_DIR = os.path.dirname(__file__) - df = dd.read_csv(os.path.join(CURRENT_DIR, os.pardir, 'data', 'veterans_lung_cancer.csv')) + df = dd.read_csv( + os.path.join(CURRENT_DIR, os.pardir, "data", "veterans_lung_cancer.csv") + ) # DaskDMatrix acts like normal DMatrix, works as a proxy for local # DMatrix scatter around workers. # For AFT survival, you'd need to extract the lower and upper bounds for the label # and pass them as arguments to DaskDMatrix. - y_lower_bound = df['Survival_label_lower_bound'] - y_upper_bound = df['Survival_label_upper_bound'] - X = df.drop(['Survival_label_lower_bound', - 'Survival_label_upper_bound'], axis=1) - dtrain = DaskDMatrix(client, X, label_lower_bound=y_lower_bound, - label_upper_bound=y_upper_bound) + y_lower_bound = df["Survival_label_lower_bound"] + y_upper_bound = df["Survival_label_upper_bound"] + X = df.drop(["Survival_label_lower_bound", "Survival_label_upper_bound"], axis=1) + dtrain = DaskDMatrix( + client, X, label_lower_bound=y_lower_bound, label_upper_bound=y_upper_bound + ) # Use train method from xgboost.dask instead of xgboost. This # distributed version of train returns a dictionary containing the # resulting booster and evaluation history obtained from # evaluation metrics. 
- params = {'verbosity': 1, - 'objective': 'survival:aft', - 'eval_metric': 'aft-nloglik', - 'learning_rate': 0.05, - 'aft_loss_distribution_scale': 1.20, - 'aft_loss_distribution': 'normal', - 'max_depth': 6, - 'lambda': 0.01, - 'alpha': 0.02} - output = xgb.dask.train(client, - params, - dtrain, - num_boost_round=100, - evals=[(dtrain, 'train')]) - bst = output['booster'] - history = output['history'] + params = { + "verbosity": 1, + "objective": "survival:aft", + "eval_metric": "aft-nloglik", + "learning_rate": 0.05, + "aft_loss_distribution_scale": 1.20, + "aft_loss_distribution": "normal", + "max_depth": 6, + "lambda": 0.01, + "alpha": 0.02, + } + output = xgb.dask.train( + client, params, dtrain, num_boost_round=100, evals=[(dtrain, "train")] + ) + bst = output["booster"] + history = output["history"] # you can pass output directly into `predict` too. prediction = xgb.dask.predict(client, bst, dtrain) - print('Evaluation history: ', history) + print("Evaluation history: ", history) # Uncomment the following line to save the model to the disk # bst.save_model('survival_model.json') @@ -62,7 +64,7 @@ def main(client): return prediction -if __name__ == '__main__': +if __name__ == "__main__": # or use other clusters for scaling with LocalCluster(n_workers=7, threads_per_worker=4) as cluster: with Client(cluster) as client: diff --git a/demo/dask/cpu_training.py b/demo/dask/cpu_training.py index a31e5d2a6..811af5cd3 100644 --- a/demo/dask/cpu_training.py +++ b/demo/dask/cpu_training.py @@ -15,7 +15,7 @@ def main(client): m = 100000 n = 100 X = da.random.random(size=(m, n), chunks=100) - y = da.random.random(size=(m, ), chunks=100) + y = da.random.random(size=(m,), chunks=100) # DaskDMatrix acts like normal DMatrix, works as a proxy for local # DMatrix scatter around workers. @@ -25,21 +25,23 @@ def main(client): # distributed version of train returns a dictionary containing the # resulting booster and evaluation history obtained from # evaluation metrics. - output = xgb.dask.train(client, - {'verbosity': 1, - 'tree_method': 'hist'}, - dtrain, - num_boost_round=4, evals=[(dtrain, 'train')]) - bst = output['booster'] - history = output['history'] + output = xgb.dask.train( + client, + {"verbosity": 1, "tree_method": "hist"}, + dtrain, + num_boost_round=4, + evals=[(dtrain, "train")], + ) + bst = output["booster"] + history = output["history"] # you can pass output directly into `predict` too. prediction = xgb.dask.predict(client, bst, dtrain) - print('Evaluation history:', history) + print("Evaluation history:", history) return prediction -if __name__ == '__main__': +if __name__ == "__main__": # or use other clusters for scaling with LocalCluster(n_workers=7, threads_per_worker=4) as cluster: with Client(cluster) as client: diff --git a/demo/dask/gpu_training.py b/demo/dask/gpu_training.py index 1a75f6c70..6eea00692 100644 --- a/demo/dask/gpu_training.py +++ b/demo/dask/gpu_training.py @@ -13,33 +13,38 @@ from xgboost import dask as dxgb from xgboost.dask import DaskDMatrix -def using_dask_matrix(client: Client, X, y): - # DaskDMatrix acts like normal DMatrix, works as a proxy for local - # DMatrix scatter around workers. +def using_dask_matrix(client: Client, X: da.Array, y: da.Array) -> da.Array: + # DaskDMatrix acts like normal DMatrix, works as a proxy for local DMatrix scatter + # around workers. dtrain = DaskDMatrix(client, X, y) - # Use train method from xgboost.dask instead of xgboost. 
This - # distributed version of train returns a dictionary containing the - # resulting booster and evaluation history obtained from - # evaluation metrics. - output = xgb.dask.train(client, - {'verbosity': 2, - # Golden line for GPU training - 'tree_method': 'gpu_hist'}, - dtrain, - num_boost_round=4, evals=[(dtrain, 'train')]) - bst = output['booster'] - history = output['history'] + # Use train method from xgboost.dask instead of xgboost. This distributed version + # of train returns a dictionary containing the resulting booster and evaluation + # history obtained from evaluation metrics. + output = xgb.dask.train( + client, + { + "verbosity": 2, + "tree_method": "hist", + # Golden line for GPU training + "device": "cuda", + }, + dtrain, + num_boost_round=4, + evals=[(dtrain, "train")], + ) + bst = output["booster"] + history = output["history"] # you can pass output directly into `predict` too. prediction = xgb.dask.predict(client, bst, dtrain) - print('Evaluation history:', history) + print("Evaluation history:", history) return prediction -def using_quantile_device_dmatrix(client: Client, X, y): - """`DaskQuantileDMatrix` is a data type specialized for `gpu_hist` and `hist` tree - methods for reducing memory usage. +def using_quantile_device_dmatrix(client: Client, X: da.Array, y: da.Array) -> da.Array: + """`DaskQuantileDMatrix` is a data type specialized for `hist` tree methods for + reducing memory usage. .. versionadded:: 1.2.0 @@ -52,26 +57,28 @@ def using_quantile_device_dmatrix(client: Client, X, y): # the `ref` argument of `DaskQuantileDMatrix`. dtrain = dxgb.DaskQuantileDMatrix(client, X, y) output = xgb.dask.train( - client, {"verbosity": 2, "tree_method": "gpu_hist"}, dtrain, num_boost_round=4 + client, + {"verbosity": 2, "tree_method": "hist", "device": "cuda"}, + dtrain, + num_boost_round=4, ) prediction = xgb.dask.predict(client, output, X) return prediction -if __name__ == '__main__': +if __name__ == "__main__": # `LocalCUDACluster` is used for assigning GPU to XGBoost processes. Here - # `n_workers` represents the number of GPUs since we use one GPU per worker - # process. + # `n_workers` represents the number of GPUs since we use one GPU per worker process. with LocalCUDACluster(n_workers=2, threads_per_worker=4) as cluster: with Client(cluster) as client: # generate some random data for demonstration m = 100000 n = 100 X = da.random.random(size=(m, n), chunks=10000) - y = da.random.random(size=(m, ), chunks=10000) + y = da.random.random(size=(m,), chunks=10000) - print('Using DaskQuantileDMatrix') + print("Using DaskQuantileDMatrix") from_ddqdm = using_quantile_device_dmatrix(client, X, y) - print('Using DMatrix') + print("Using DMatrix") from_dmatrix = using_dask_matrix(client, X, y) diff --git a/demo/dask/sklearn_gpu_training.py b/demo/dask/sklearn_gpu_training.py index 4c544e4e8..32a994464 100644 --- a/demo/dask/sklearn_gpu_training.py +++ b/demo/dask/sklearn_gpu_training.py @@ -21,7 +21,8 @@ def main(client): y = da.random.random(m, partition_size) regressor = xgboost.dask.DaskXGBRegressor(verbosity=1) - regressor.set_params(tree_method='gpu_hist') + # set the device to CUDA + regressor.set_params(tree_method="hist", device="cuda") # assigning client here is optional regressor.client = client @@ -31,13 +32,13 @@ def main(client): bst = regressor.get_booster() history = regressor.evals_result() - print('Evaluation history:', history) + print("Evaluation history:", history) # returned prediction is always a dask array. 
assert isinstance(prediction, da.Array) - return bst # returning the trained model + return bst # returning the trained model -if __name__ == '__main__': +if __name__ == "__main__": # With dask cuda, one can scale up XGBoost to arbitrary GPU clusters. # `LocalCUDACluster` used here is only for demonstration purpose. with LocalCUDACluster() as cluster: diff --git a/demo/guide-python/callbacks.py b/demo/guide-python/callbacks.py index 817a65939..42fe397db 100644 --- a/demo/guide-python/callbacks.py +++ b/demo/guide-python/callbacks.py @@ -71,7 +71,8 @@ def custom_callback(): { 'objective': 'binary:logistic', 'eval_metric': ['error', 'rmse'], - 'tree_method': 'gpu_hist' + 'tree_method': 'hist', + "device": "cuda", }, D_train, evals=[(D_train, 'Train'), (D_valid, 'Valid')], diff --git a/demo/guide-python/cat_in_the_dat.py b/demo/guide-python/cat_in_the_dat.py index fdac04d6b..f8f345bda 100644 --- a/demo/guide-python/cat_in_the_dat.py +++ b/demo/guide-python/cat_in_the_dat.py @@ -63,7 +63,8 @@ def load_cat_in_the_dat() -> tuple[pd.DataFrame, pd.Series]: params = { - "tree_method": "gpu_hist", + "tree_method": "hist", + "device": "cuda", "n_estimators": 32, "colsample_bylevel": 0.7, } diff --git a/demo/guide-python/categorical.py b/demo/guide-python/categorical.py index a7fc85c71..aa5fb005b 100644 --- a/demo/guide-python/categorical.py +++ b/demo/guide-python/categorical.py @@ -58,13 +58,13 @@ def main() -> None: # Specify `enable_categorical` to True, also we use onehot encoding based split # here for demonstration. For details see the document of `max_cat_to_onehot`. reg = xgb.XGBRegressor( - tree_method="gpu_hist", enable_categorical=True, max_cat_to_onehot=5 + tree_method="hist", enable_categorical=True, max_cat_to_onehot=5, device="cuda" ) reg.fit(X, y, eval_set=[(X, y)]) # Pass in already encoded data X_enc, y_enc = make_categorical(100, 10, 4, True) - reg_enc = xgb.XGBRegressor(tree_method="gpu_hist") + reg_enc = xgb.XGBRegressor(tree_method="hist", device="cuda") reg_enc.fit(X_enc, y_enc, eval_set=[(X_enc, y_enc)]) reg_results = np.array(reg.evals_result()["validation_0"]["rmse"]) diff --git a/demo/guide-python/external_memory.py b/demo/guide-python/external_memory.py index 11a05c61c..fdaa9dab9 100644 --- a/demo/guide-python/external_memory.py +++ b/demo/guide-python/external_memory.py @@ -82,8 +82,9 @@ def main(tmpdir: str) -> xgboost.Booster: missing = np.NaN Xy = xgboost.DMatrix(it, missing=missing, enable_categorical=False) - # Other tree methods including ``approx``, and ``gpu_hist`` are supported. GPU - # behaves differently than CPU tree methods. See tutorial in doc for details. + # ``approx`` is also supported, but less efficient due to sketching. GPU behaves + # differently than CPU tree methods as it uses a hybrid approach. See tutorial in + # doc for details. 
    booster = xgboost.train(
        {"tree_method": "hist", "max_depth": 4},
        Xy,
diff --git a/demo/guide-python/learning_to_rank.py b/demo/guide-python/learning_to_rank.py
index 37b7157f5..62df8253b 100644
--- a/demo/guide-python/learning_to_rank.py
+++ b/demo/guide-python/learning_to_rank.py
@@ -104,7 +104,8 @@ def ranking_demo(args: argparse.Namespace) -> None:
     qid_test = qid_test[sorted_idx]
 
     ranker = xgb.XGBRanker(
-        tree_method="gpu_hist",
+        tree_method="hist",
+        device="cuda",
         lambdarank_pair_method="topk",
         lambdarank_num_pair_per_sample=13,
         eval_metric=["ndcg@1", "ndcg@8"],
@@ -161,7 +162,8 @@ def click_data_demo(args: argparse.Namespace) -> None:
 
     ranker = xgb.XGBRanker(
         n_estimators=512,
-        tree_method="gpu_hist",
+        tree_method="hist",
+        device="cuda",
         learning_rate=0.01,
         reg_lambda=1.5,
         subsample=0.8,
diff --git a/demo/guide-python/quantile_data_iterator.py b/demo/guide-python/quantile_data_iterator.py
index 29dd96b24..1241caef4 100644
--- a/demo/guide-python/quantile_data_iterator.py
+++ b/demo/guide-python/quantile_data_iterator.py
@@ -23,22 +23,23 @@ import numpy
 import xgboost
 
 COLS = 64
-ROWS_PER_BATCH = 1000 # data is splited by rows
+ROWS_PER_BATCH = 1000  # data is split by rows
 BATCHES = 32
 
 
 class IterForDMatrixDemo(xgboost.core.DataIter):
-    '''A data iterator for XGBoost DMatrix.
+    """A data iterator for XGBoost DMatrix.
 
     `reset` and `next` are required for any data iterator, other functions here
     are utilities for demonstration's purpose.
 
-    '''
+    """
+
     def __init__(self):
-        '''Generate some random data for demostration.
+        """Generate some random data for demonstration.
 
         Actual data can be anything that is currently supported by XGBoost.
-        '''
+        """
         self.rows = ROWS_PER_BATCH
         self.cols = COLS
         rng = cupy.random.RandomState(1994)
@@ -46,7 +47,7 @@ class IterForDMatrixDemo(xgboost.core.DataIter):
         self._labels = [rng.randn(self.rows)] * BATCHES
         self._weights = [rng.uniform(size=self.rows)] * BATCHES
 
-        self.it = 0 # set iterator to 0
+        self.it = 0  # set iterator to 0
         super().__init__()
 
     def as_array(self):
@@ -59,27 +60,26 @@ class IterForDMatrixDemo(xgboost.core.DataIter):
         return cupy.concatenate(self._weights)
 
     def data(self):
-        '''Utility function for obtaining current batch of data.'''
+        """Utility function for obtaining current batch of data."""
         return self._data[self.it]
 
     def labels(self):
-        '''Utility function for obtaining current batch of label.'''
+        """Utility function for obtaining current batch of label."""
         return self._labels[self.it]
 
     def weights(self):
         return self._weights[self.it]
 
     def reset(self):
-        '''Reset the iterator'''
+        """Reset the iterator"""
         self.it = 0
 
     def next(self, input_data):
-        '''Yield next batch of data.'''
+        """Yield next batch of data."""
         if self.it == len(self._data):
             # Return 0 when there's no more batch.
             return 0
-        input_data(data=self.data(), label=self.labels(),
-                   weight=self.weights())
+        input_data(data=self.data(), label=self.labels(), weight=self.weights())
         self.it += 1
         return 1
@@ -103,18 +103,19 @@ def main():
     assert m_with_it.num_col() == m.num_col()
     assert m_with_it.num_row() == m.num_row()
 
-    # Tree meethod must be one of the `hist` or `gpu_hist`. We use `gpu_hist` for GPU
-    # input here.
+    # Tree method must be `hist`.
     reg_with_it = xgboost.train(
         {"tree_method": "hist", "device": "cuda"}, m_with_it, num_boost_round=rounds
     )
     predict_with_it = reg_with_it.predict(m_with_it)
 
-    reg = xgboost.train({"tree_method": "gpu_hist"}, m, num_boost_round=rounds)
+    reg = xgboost.train(
+        {"tree_method": "hist", "device": "cuda"}, m, num_boost_round=rounds
+    )
     predict = reg.predict(m)
 
     numpy.testing.assert_allclose(predict_with_it, predict, rtol=1e6)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/demo/guide-python/update_process.py b/demo/guide-python/update_process.py
index 77e0dc870..17bbbc39c 100644
--- a/demo/guide-python/update_process.py
+++ b/demo/guide-python/update_process.py
@@ -24,7 +24,7 @@ def main():
     Xy = xgb.DMatrix(X_train, y_train)
     evals_result: xgb.callback.EvaluationMonitor.EvalsLog = {}
     booster = xgb.train(
-        {"tree_method": "gpu_hist", "max_depth": 6},
+        {"tree_method": "hist", "max_depth": 6, "device": "cuda"},
         Xy,
         num_boost_round=n_rounds,
         evals=[(Xy, "Train")],
@@ -33,8 +33,8 @@ def main():
     SHAP = booster.predict(Xy, pred_contribs=True)
 
     # Refresh the leaf value and tree statistic
-    X_refresh = X[X.shape[0] // 2:]
-    y_refresh = y[y.shape[0] // 2:]
+    X_refresh = X[X.shape[0] // 2 :]
+    y_refresh = y[y.shape[0] // 2 :]
     Xy_refresh = xgb.DMatrix(X_refresh, y_refresh)
     # The model will adapt to other half of the data by changing leaf value (no change in
     # split condition) with refresh_leaf set to True.
@@ -87,7 +87,7 @@ def main():
     np.testing.assert_allclose(
         np.array(prune_result["Original"]["rmse"]),
         np.array(prune_result["Train"]["rmse"]),
-        atol=1e-5
+        atol=1e-5,
     )
diff --git a/doc/gpu/index.rst b/doc/gpu/index.rst
index 3cee0cdf5..4489c1427 100644
--- a/doc/gpu/index.rst
+++ b/doc/gpu/index.rst
@@ -14,30 +14,24 @@ Most of the algorithms in XGBoost including training, prediction and evaluation
 Usage
 =====
 
-Specify the ``tree_method`` parameter as ``gpu_hist``. For details around the ``tree_method`` parameter, see :doc:`tree method `.
-
-Supported parameters
--------------------
-
-GPU accelerated prediction is enabled by default for the above mentioned ``tree_method`` parameters but can be switched to CPU prediction by setting ``predictor`` to ``cpu_predictor``. This could be useful if you want to conserve GPU memory. Likewise when using CPU algorithms, GPU accelerated prediction can be enabled by setting ``predictor`` to ``gpu_predictor``.
-
-The device ordinal (which GPU to use if you have many of them) can be selected using the
-``device`` parameter, which defaults to 0 when "CUDA" is specified(the first device reported by CUDA
-runtime).
+To enable GPU acceleration, specify the ``device`` parameter as ``cuda``. In addition, the device ordinal (which GPU to use if you have multiple devices in the same node) can be specified using the ``cuda:<ordinal>`` syntax, where ``<ordinal>`` is an integer that represents the device ordinal. XGBoost defaults to 0 (the first device reported by CUDA runtime).
 
 The GPU algorithms currently work with CLI, Python, R, and JVM packages. See :doc:`/install` for details.
 
 .. code-block:: python
   :caption: Python example
 
-  param["device"] = "cuda:0"
-  param['tree_method'] = 'gpu_hist'
+  params = dict()
+  params["device"] = "cuda:0"
+  params["tree_method"] = "hist"
+  Xy = xgboost.QuantileDMatrix(X, y)
+  xgboost.train(params, Xy)
 
 .. code-block:: python
   :caption: With Scikit-Learn interface
 
-  XGBRegressor(tree_method='gpu_hist', device="cuda")
+  XGBRegressor(tree_method="hist", device="cuda")
 
 
 GPU-Accelerated SHAP values
 ===========================
 XGBoost makes use of `GPUTreeShap `_ as a backend for computing SHAP values.
 
 .. code-block:: python
 
-    model.set_param({"device": "cuda:0", "tree_method": "gpu_hist"})
-    shap_values = model.predict(dtrain, pred_contribs=True)
+    booster.set_param({"device": "cuda:0"})
+    shap_values = booster.predict(dtrain, pred_contribs=True)
     shap_interaction_values = booster.predict(dtrain, pred_interactions=True)
 
-See examples `here
-`__.
+See examples `here `__.
 
 Multi-node Multi-GPU Training
 =============================
 XGBoost supports fully distributed GPU training using `Dask `
 
 Memory usage
 ============
-The following are some guidelines on the device memory usage of the `gpu_hist` tree method.
+The following are some guidelines on the device memory usage of the ``hist`` tree method on GPU.
 
 Memory inside xgboost training is generally allocated for two reasons - storing the dataset and working memory.
 
@@ -79,7 +72,7 @@ XGBoost models trained on GPUs can be used on CPU-only systems to generate predi
 Developer notes
 ===============
 
-The application may be profiled with annotations by specifying USE_NTVX to cmake. Regions covered by the 'Monitor' class in CUDA code will automatically appear in the nsight profiler when `verbosity` is set to 3.
+The application may be profiled with annotations by specifying ``USE_NVTX`` to cmake. Regions covered by the 'Monitor' class in CUDA code will automatically appear in the nsight profiler when `verbosity` is set to 3.
 
 **********
 References
diff --git a/doc/parameter.rst b/doc/parameter.rst
index d628d161b..382cddd4f 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -55,10 +55,6 @@ General Parameters
 
   - Flag to disable default metric. Set to 1 or ``true`` to disable.
 
-* ``num_feature`` [set automatically by XGBoost, no need to be set by user]
-
-  - Feature dimension used in boosting, set to maximum dimension of the feature
-
 * ``device`` [default= ``cpu``]
 
   .. versionadded:: 2.0.0
@@ -164,7 +160,7 @@ Parameters for Tree Booster
   - ``grow_colmaker``: non-distributed column-based construction of trees.
   - ``grow_histmaker``: distributed tree construction with row-based data splitting based on global proposal of histogram counting.
   - ``grow_quantile_histmaker``: Grow tree using quantized histogram.
-  - ``grow_gpu_hist``: Grow tree with GPU. Same as setting tree method to ``hist`` and use ``device=cuda``.
+  - ``grow_gpu_hist``: Grow tree with GPU. Same as setting ``tree_method`` to ``hist`` and use ``device=cuda``.
   - ``sync``: synchronizes trees in all distributed nodes.
   - ``refresh``: refreshes tree's statistics and/or leaf values based on the current data. Note that no random subsampling of data rows is performed.
   - ``prune``: prunes the splits where loss < min_split_loss (or gamma) and nodes that have depth greater than ``max_depth``.
@@ -421,7 +417,7 @@ Specify the learning task and the corresponding learning objective. The objectiv
 
     .. math::
 
-      AP@l = \frac{1}{min{(l, N)}}\sum^l_{k=1}P@k \cdot I_{(k)}
+      AP@l = \frac{1}{min{(l, N)}}\sum^l_{k=1}P@k \cdot I_{(k)}
 
   where :math:`I_{(k)}` is an indicator function that equals to :math:`1` when the document at :math:`k` is relevant and :math:`0` otherwise. The :math:`P@k` is the precision at :math:`k`, and :math:`N` is the total number of relevant documents.
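 For instance, if the top :math:`l = 3` documents returned for a query have relevance :math:`(1, 0, 1)` and the query has :math:`N \geq 3` relevant documents in total, then :math:`AP@3 = \frac{1}{3}\left(\frac{1}{1} + \frac{2}{3}\right) \approx 0.56`.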
Lastly, the `mean average precision` is defined as the weighted average across all queries. diff --git a/doc/python/python_intro.rst b/doc/python/python_intro.rst index 505556383..bb74e7bc3 100644 --- a/doc/python/python_intro.rst +++ b/doc/python/python_intro.rst @@ -310,8 +310,8 @@ for more info. .. code-block:: python - # Use "gpu_hist" for training the model. - reg = xgb.XGBRegressor(tree_method="gpu_hist") + # Use "hist" for training the model. + reg = xgb.XGBRegressor(tree_method="hist", device="cuda") # Fit the model using predictor X and response y. reg.fit(X, y) # Save model into JSON format. diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst index 7fde35b0e..131929b24 100644 --- a/doc/tutorials/dask.rst +++ b/doc/tutorials/dask.rst @@ -56,7 +56,6 @@ on a dask cluster: dtrain = xgb.dask.DaskDMatrix(client, X, y) # or # dtrain = xgb.dask.DaskQuantileDMatrix(client, X, y) - # `DaskQuantileDMatrix` is available for the `hist` and `gpu_hist` tree method. output = xgb.dask.train( client, @@ -149,7 +148,7 @@ Also for inplace prediction: .. code-block:: python # where X is a dask DataFrame or dask Array backed by cupy or cuDF. - booster.set_param({"device": "cuda:0"}) + booster.set_param({"device": "cuda"}) prediction = xgb.dask.inplace_predict(client, booster, X) When input is ``da.Array`` object, output is always ``da.Array``. However, if the input @@ -225,6 +224,12 @@ collection. main(client) +**************** +GPU acceleration +**************** + +For most of the use cases with GPUs, the `Dask-CUDA `__ project should be used to create the cluster, which automatically configures the correct device ordinal for worker processes. As a result, users should NOT specify the ordinal (good: ``device=cuda``, bad: ``device=cuda:1``). See :ref:`sphx_glr_python_dask-examples_gpu_training.py` and :ref:`sphx_glr_python_dask-examples_sklearn_gpu_training.py` for worked examples. + *************************** Working with other clusters *************************** @@ -262,7 +267,7 @@ In the example below, a ``KubeCluster`` is used for `deploying Dask on Kubernete regressor = xgb.dask.DaskXGBRegressor(n_estimators=10, missing=0.0) regressor.client = client - regressor.set_params(tree_method='gpu_hist') + regressor.set_params(tree_method='hist', device="cuda") regressor.fit(X, y, eval_set=[(X, y)]) diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index 2a7d51393..8844b853b 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -478,7 +478,7 @@ XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy * \param config JSON encoded parameters for DMatrix construction. Accepted fields are: * - missing: Which value to represent missing value * - nthread (optional): Number of threads used for initializing DMatrix. - * - max_bin (optional): Maximum number of bins for building histogram. + * - max_bin (optional): Maximum number of bins for building histogram. 
* \param out The created Device Quantile DMatrix * * \return 0 when success, -1 when failure happens diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 0250dd293..86c49e0ff 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -1451,7 +1451,7 @@ class QuantileDMatrix(DMatrix): enable_categorical: bool = False, data_split_mode: DataSplitMode = DataSplitMode.ROW, ) -> None: - self.max_bin: int = max_bin if max_bin is not None else 256 + self.max_bin = max_bin self.missing = missing if missing is not None else np.nan self.nthread = nthread if nthread is not None else -1 self._silent = silent # unused, kept for compatibility diff --git a/python-package/xgboost/dask.py b/python-package/xgboost/dask.py index 35c5c009f..32dd2a4a7 100644 --- a/python-package/xgboost/dask.py +++ b/python-package/xgboost/dask.py @@ -82,6 +82,7 @@ from .sklearn import ( XGBRanker, XGBRankerMixIn, XGBRegressorBase, + _can_use_qdm, _check_rf_callback, _cls_predict_proba, _objective_decorator, @@ -617,14 +618,7 @@ class DaskPartitionIter(DataIter): # pylint: disable=R0902 if self._iter == len(self._data): # Return 0 when there's no more batch. return 0 - feature_names: Optional[FeatureNames] = None - if self._feature_names: - feature_names = self._feature_names - else: - if hasattr(self.data(), "columns"): - feature_names = self.data().columns.format() - else: - feature_names = None + input_data( data=self.data(), label=self._get("_label"), @@ -634,7 +628,7 @@ class DaskPartitionIter(DataIter): # pylint: disable=R0902 base_margin=self._get("_base_margin"), label_lower_bound=self._get("_label_lower_bound"), label_upper_bound=self._get("_label_upper_bound"), - feature_names=feature_names, + feature_names=self._feature_names, feature_types=self._feature_types, feature_weights=self._feature_weights, ) @@ -935,6 +929,12 @@ async def _train_async( raise NotImplementedError( f"booster `{params['booster']}` is not yet supported for dask." ) + device = params.get("device", None) + if device and device.find(":") != -1: + raise ValueError( + "The dask interface for XGBoost doesn't support selecting specific device" + " ordinal. Use `device=cpu` or `device=cuda` instead." + ) def dispatched_train( parameters: Dict, @@ -1574,7 +1574,7 @@ async def _async_wrap_evaluation_matrices( """A switch function for async environment.""" def _dispatch(ref: Optional[DaskDMatrix], **kwargs: Any) -> DaskDMatrix: - if tree_method in ("hist", "gpu_hist"): + if _can_use_qdm(tree_method): return DaskQuantileDMatrix( client=client, ref=ref, max_bin=max_bin, **kwargs ) diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index e9f9e9f10..d69cb3a01 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -76,6 +76,10 @@ def _check_rf_callback( ) +def _can_use_qdm(tree_method: Optional[str]) -> bool: + return tree_method in ("hist", "gpu_hist", None, "auto") + + SklObjective = Optional[ Union[str, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]] ] @@ -939,7 +943,7 @@ class XGBModel(XGBModelBase): def _create_dmatrix(self, ref: Optional[DMatrix], **kwargs: Any) -> DMatrix: # Use `QuantileDMatrix` to save memory. 
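# A note on why this saves memory: `QuantileDMatrix` quantizes the input into
# histogram bins as it is constructed, so the full-precision intermediate copy
# that a regular `DMatrix` would materialize is never created.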
- if self.tree_method in ("hist", "gpu_hist"): + if _can_use_qdm(self.tree_method) and self.booster != "gblinear": try: return QuantileDMatrix( **kwargs, ref=ref, nthread=self.n_jobs, max_bin=self.max_bin diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index a170fbf9f..283999c6d 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -61,7 +61,7 @@ import xgboost from xgboost import XGBClassifier from xgboost.compat import is_cudf_available from xgboost.core import Booster -from xgboost.sklearn import DEFAULT_N_ESTIMATORS, XGBModel +from xgboost.sklearn import DEFAULT_N_ESTIMATORS, XGBModel, _can_use_qdm from xgboost.training import train as worker_train from .data import ( @@ -901,7 +901,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): context = BarrierTaskContext.get() dev_ordinal = None - use_hist = booster_params.get("tree_method", None) in ("hist", "gpu_hist") + use_qdm = _can_use_qdm(booster_params.get("tree_method", None)) if use_gpu: dev_ordinal = ( @@ -912,9 +912,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): # because without cuDF, DMatrix performs better than QDM. # Note: Checking `is_cudf_available` in spark worker side because # spark worker might has different python environment with driver side. - use_qdm = use_hist and is_cudf_available() - else: - use_qdm = use_hist + use_qdm = use_qdm and is_cudf_available() if use_qdm and (booster_params.get("max_bin", None) is not None): dmatrix_kwargs["max_bin"] = booster_params["max_bin"] diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 4e1f86ff2..0c98c0198 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -81,13 +81,6 @@ void XGBBuildInfoDevice(Json *p_info) { } // namespace xgboost #endif -namespace { -void DeprecatedFunc(StringView old, StringView since, StringView replacement) { - LOG(WARNING) << "`" << old << "` is deprecated since" << since << ", use `" << replacement - << "` instead."; -} -} // anonymous namespace - XGB_DLL int XGBuildInfo(char const **out) { API_BEGIN(); xgboost_CHECK_C_ARG_PTR(out); @@ -328,7 +321,7 @@ XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatr int nthread, int max_bin, DMatrixHandle *out) { API_BEGIN(); - DeprecatedFunc(__func__, "1.7.0", "XGQuantileDMatrixCreateFromCallback"); + LOG(WARNING) << error::DeprecatedFunc(__func__, "1.7.0", "XGQuantileDMatrixCreateFromCallback"); *out = new std::shared_ptr{ xgboost::DMatrix::Create(iter, proxy, nullptr, reset, next, missing, nthread, max_bin)}; API_END(); @@ -432,7 +425,7 @@ XGB_DLL int XGDMatrixCreateFromCSREx(const size_t *indptr, const unsigned *indic const bst_float *data, size_t nindptr, size_t nelem, size_t num_col, DMatrixHandle *out) { API_BEGIN(); - DeprecatedFunc(__func__, "2.0.0", "XGDMatrixCreateFromCSR"); + LOG(WARNING) << error::DeprecatedFunc(__func__, "2.0.0", "XGDMatrixCreateFromCSR"); data::CSRAdapter adapter(indptr, indices, data, nindptr - 1, nelem, num_col); *out = new std::shared_ptr(DMatrix::Create(&adapter, std::nan(""), 1)); API_END(); @@ -496,7 +489,7 @@ XGB_DLL int XGDMatrixCreateFromCSCEx(const size_t *col_ptr, const unsigned *indi const bst_float *data, size_t nindptr, size_t, size_t num_row, DMatrixHandle *out) { API_BEGIN(); - DeprecatedFunc(__func__, "2.0.0", "XGDMatrixCreateFromCSC"); + LOG(WARNING) << error::DeprecatedFunc(__func__, "2.0.0", "XGDMatrixCreateFromCSC"); data::CSCAdapter adapter(col_ptr, indices, data, nindptr 
- 1, num_row); xgboost_CHECK_C_ARG_PTR(out); *out = new std::shared_ptr(DMatrix::Create(&adapter, std::nan(""), 1)); @@ -1347,7 +1340,7 @@ XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle, xgboost::bst_ulong *out_l raw_str.resize(0); common::MemoryBufferStream fo(&raw_str); - DeprecatedFunc(__func__, "1.6.0", "XGBoosterSaveModelToBuffer"); + LOG(WARNING) << error::DeprecatedFunc(__func__, "1.6.0", "XGBoosterSaveModelToBuffer"); learner->Configure(); learner->SaveModel(&fo); diff --git a/src/common/error_msg.cc b/src/common/error_msg.cc index bb57014a6..593c7d6de 100644 --- a/src/common/error_msg.cc +++ b/src/common/error_msg.cc @@ -3,10 +3,18 @@ */ #include "error_msg.h" +#include // for stringstream + #include "../collective/communicator-inl.h" // for GetRank #include "xgboost/logging.h" namespace xgboost::error { +std::string DeprecatedFunc(StringView old, StringView since, StringView replacement) { + std::stringstream ss; + ss << "`" << old << "` is deprecated since" << since << ", use `" << replacement << "` instead."; + return ss.str(); +} + void WarnDeprecatedGPUHist() { auto msg = "The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` " @@ -34,8 +42,9 @@ void WarnDeprecatedGPUId() { if (logged) { return; } - LOG(WARNING) << "`gpu_id` is deprecated in favor of the new `device` parameter: " - << "device = cpu/cuda/cuda:0"; + auto msg = DeprecatedFunc("gpu_id", "2.0.0", "device"); + msg += " E.g. device=cpu/cuda/cuda:0"; + LOG(WARNING) << msg; logged = true; } diff --git a/src/common/error_msg.h b/src/common/error_msg.h index 07b5c3e53..8bdc85999 100644 --- a/src/common/error_msg.h +++ b/src/common/error_msg.h @@ -8,6 +8,7 @@ #include // for uint64_t #include // for numeric_limits +#include // for string #include "xgboost/base.h" // for bst_feature_t #include "xgboost/logging.h" @@ -86,5 +87,7 @@ void WarnManualUpdater(); void WarnDeprecatedGPUId(); void WarnEmptyDataset(); + +std::string DeprecatedFunc(StringView old, StringView since, StringView replacement); } // namespace xgboost::error #endif // XGBOOST_COMMON_ERROR_MSG_H_ diff --git a/src/learner.cc b/src/learner.cc index 03714a056..2f453ea30 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -690,19 +690,20 @@ class LearnerConfiguration : public Learner { stack.pop(); auto const &obj = get(j_obj); - for (auto const &kv : obj) { + for (auto const& kv : obj) { if (is_parameter(kv.first)) { auto parameter = get(kv.second); - std::transform(parameter.begin(), parameter.end(), std::back_inserter(keys), - [](std::pair const& kv) { - return kv.first; - }); + std::transform( + parameter.begin(), parameter.end(), std::back_inserter(keys), + [](std::pair const& kv) { return kv.first; }); } else if (IsA(kv.second)) { stack.push(kv.second); - } else if (kv.first == "metrics") { + } else if (IsA(kv.second)) { auto const& array = get(kv.second); for (auto const& v : array) { - stack.push(v); + if (IsA(v) || IsA(v)) { + stack.push(v); + } } } } diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py index 08baa844b..ca5d56e4c 100644 --- a/tests/ci_build/lint_python.py +++ b/tests/ci_build/lint_python.py @@ -32,6 +32,7 @@ class LintersPaths: "tests/test_distributed/test_with_spark/", "tests/test_distributed/test_gpu_with_spark/", # demo + "demo/dask/", "demo/json-model/json_parser.py", "demo/guide-python/cat_in_the_dat.py", "demo/guide-python/categorical.py", @@ -42,6 +43,8 @@ class LintersPaths: "demo/guide-python/quantile_regression.py", 
"demo/guide-python/multioutput_regression.py", "demo/guide-python/learning_to_rank.py", + "demo/guide-python/quantile_data_iterator.py", + "demo/guide-python/update_process.py", "demo/aft_survival/aft_survival_viz_demo.py", # CI "tests/ci_build/lint_python.py", diff --git a/tests/python/test_quantile_dmatrix.py b/tests/python/test_quantile_dmatrix.py index c1ec23ea3..b7428dfac 100644 --- a/tests/python/test_quantile_dmatrix.py +++ b/tests/python/test_quantile_dmatrix.py @@ -322,3 +322,15 @@ class TestQuantileDMatrix: X: np.ndarray = np.array(orig, dtype=dtype) with pytest.raises(ValueError): xgb.QuantileDMatrix(X) + + def test_changed_max_bin(self) -> None: + n_samples = 128 + n_features = 16 + csr, y = make_sparse_regression(n_samples, n_features, 0.5, False) + Xy = xgb.QuantileDMatrix(csr, y, max_bin=9) + booster = xgb.train({"max_bin": 9}, Xy, num_boost_round=2) + + Xy = xgb.QuantileDMatrix(csr, y, max_bin=11) + + with pytest.raises(ValueError, match="consistent"): + xgb.train({}, Xy, num_boost_round=2, xgb_model=booster) diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index 2027942fe..029911bf0 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -27,7 +27,7 @@ def train_result(param, dmat, num_rounds): param, dmat, num_rounds, - [(dmat, "train")], + evals=[(dmat, "train")], verbose_eval=False, evals_result=result, ) @@ -169,13 +169,21 @@ class TestTreeMethod: hist_res = {} exact_res = {} - xgb.train(ag_param, ag_dtrain, 10, - [(ag_dtrain, 'train'), (ag_dtest, 'test')], - evals_result=hist_res) + xgb.train( + ag_param, + ag_dtrain, + 10, + evals=[(ag_dtrain, "train"), (ag_dtest, "test")], + evals_result=hist_res + ) ag_param["tree_method"] = "exact" - xgb.train(ag_param, ag_dtrain, 10, - [(ag_dtrain, 'train'), (ag_dtest, 'test')], - evals_result=exact_res) + xgb.train( + ag_param, + ag_dtrain, + 10, + evals=[(ag_dtrain, "train"), (ag_dtest, "test")], + evals_result=exact_res + ) assert hist_res['train']['auc'] == exact_res['train']['auc'] assert hist_res['test']['auc'] == exact_res['test']['auc'] diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index b961db2c4..9a58b7277 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -1349,10 +1349,11 @@ def test_multilabel_classification() -> None: np.testing.assert_allclose(clf.predict(X), predt) -def test_data_initialization(): +def test_data_initialization() -> None: from sklearn.datasets import load_digits + X, y = load_digits(return_X_y=True) - validate_data_initialization(xgb.DMatrix, xgb.XGBClassifier, X, y) + validate_data_initialization(xgb.QuantileDMatrix, xgb.XGBClassifier, X, y) @parametrize_with_checks([xgb.XGBRegressor()]) diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py index 047093700..9386486de 100644 --- a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py @@ -1,10 +1,9 @@ """Copyright 2019-2022 XGBoost contributors""" import asyncio -import os -import subprocess +import json from collections import OrderedDict from inspect import signature -from typing import Any, Dict, Type, TypeVar, Union +from typing import Any, Dict, Type, TypeVar import numpy as np import pytest @@ -64,7 +63,7 @@ def run_with_dask_dataframe(DMatrixT: Type, client: Client) -> None: dtrain = DMatrixT(client, X, y) out = dxgb.train( client, - {"tree_method": 
"gpu_hist", "debug_synchronize": True}, + {"tree_method": "hist", "debug_synchronize": True, "device": "cuda"}, dtrain=dtrain, evals=[(dtrain, "X")], num_boost_round=4, @@ -116,12 +115,18 @@ def run_with_dask_array(DMatrixT: Type, client: Client) -> None: dtrain = DMatrixT(client, X, y) out = dxgb.train( client, - {"tree_method": "gpu_hist", "debug_synchronize": True}, + {"tree_method": "hist", "debug_synchronize": True, "device": "cuda"}, dtrain=dtrain, evals=[(dtrain, "X")], num_boost_round=2, ) from_dmatrix = dxgb.predict(client, out, dtrain).compute() + assert ( + json.loads(out["booster"].save_config())["learner"]["gradient_booster"][ + "updater" + ][0]["name"] + == "grow_gpu_hist" + ) inplace_predictions = dxgb.inplace_predict(client, out, X).compute() single_node = out["booster"].predict(xgb.DMatrix(X.compute())) np.testing.assert_allclose(single_node, from_dmatrix) @@ -149,7 +154,8 @@ def run_gpu_hist( DMatrixT: Type, client: Client, ) -> None: - params["tree_method"] = "gpu_hist" + params["tree_method"] = "hist" + params["device"] = "cuda" params = dataset.set_params(params) # It doesn't make sense to distribute a completely # empty dataset. @@ -196,11 +202,11 @@ def run_gpu_hist( def test_tree_stats() -> None: with LocalCUDACluster(n_workers=1) as cluster: with Client(cluster) as client: - local = run_tree_stats(client, "gpu_hist") + local = run_tree_stats(client, "hist", "cuda") with LocalCUDACluster(n_workers=2) as cluster: with Client(cluster) as client: - distributed = run_tree_stats(client, "gpu_hist") + distributed = run_tree_stats(client, "hist", "cuda") assert local == distributed @@ -214,12 +220,12 @@ class TestDistributedGPU: X_, y_ = load_breast_cancer(return_X_y=True) X = dd.from_array(X_, chunksize=100).map_partitions(cudf.from_pandas) y = dd.from_array(y_, chunksize=100).map_partitions(cudf.from_pandas) - run_boost_from_prediction(X, y, "gpu_hist", local_cuda_client) + run_boost_from_prediction(X, y, "hist", "cuda", local_cuda_client) X_, y_ = load_iris(return_X_y=True) X = dd.from_array(X_, chunksize=50).map_partitions(cudf.from_pandas) y = dd.from_array(y_, chunksize=50).map_partitions(cudf.from_pandas) - run_boost_from_prediction_multi_class(X, y, "gpu_hist", local_cuda_client) + run_boost_from_prediction_multi_class(X, y, "hist", "cuda", local_cuda_client) def test_init_estimation(self, local_cuda_client: Client) -> None: check_init_estimation("gpu_hist", local_cuda_client) @@ -282,7 +288,7 @@ class TestDistributedGPU: ) result = xgb.dask.train( client, - {"tree_method": "gpu_hist"}, + {"tree_method": "hist", "device": "cuda", "debug_synchronize": True}, Xy, num_boost_round=10, evals=[(Xy_valid, "Valid")], @@ -313,7 +319,8 @@ class TestDistributedGPU: { "objective": "binary:logistic", "eval_metric": "error", - "tree_method": "gpu_hist", + "tree_method": "hist", + "device": "cuda", }, m, evals=[(valid, "Valid")], @@ -328,7 +335,8 @@ class TestDistributedGPU: valid_y = y cls = dxgb.DaskXGBClassifier( objective="binary:logistic", - tree_method="gpu_hist", + tree_method="hist", + device="cuda", eval_metric="error", n_estimators=100, ) @@ -356,7 +364,11 @@ class TestDistributedGPU: run_dask_classifier(X, y, w, model, "gpu_hist", local_cuda_client, 10) def test_empty_dmatrix(self, local_cuda_client: Client) -> None: - parameters = {"tree_method": "gpu_hist", "debug_synchronize": True} + parameters = { + "tree_method": "hist", + "debug_synchronize": True, + "device": "cuda", + } run_empty_dmatrix_reg(local_cuda_client, parameters) 
run_empty_dmatrix_cls(local_cuda_client, parameters) @@ -374,7 +386,11 @@ class TestDistributedGPU: "y": [10, 20, 30, 40.0, 50] * mult, } ) - parameters = {"tree_method": "gpu_hist", "debug_synchronize": True} + parameters = { + "tree_method": "hist", + "debug_synchronize": True, + "device": "cuda", + } empty = df.iloc[:0] ddf = dask_cudf.concat( @@ -432,13 +448,25 @@ class TestDistributedGPU: def test_empty_dmatrix_auc(self, local_cuda_client: Client) -> None: n_workers = len(tm.get_client_workers(local_cuda_client)) - run_empty_dmatrix_auc(local_cuda_client, "gpu_hist", n_workers) + run_empty_dmatrix_auc(local_cuda_client, "cuda", n_workers) def test_auc(self, local_cuda_client: Client) -> None: - run_auc(local_cuda_client, "gpu_hist") + run_auc(local_cuda_client, "cuda") + + def test_invalid_ordinal(self, local_cuda_client: Client) -> None: + """One should not specify the device ordinal with dask.""" + with pytest.raises(ValueError, match="device=cuda"): + X, y, _ = generate_array() + m = dxgb.DaskDMatrix(local_cuda_client, X, y) + dxgb.train(local_cuda_client, {"device": "cuda:0"}, m) + + booster = dxgb.train(local_cuda_client, {"device": "cuda"}, m)["booster"] + assert ( + json.loads(booster.save_config())["learner"]["generic_param"]["device"] + == "cuda:0" + ) def test_data_initialization(self, local_cuda_client: Client) -> None: - X, y, _ = generate_array() fw = da.random.random((random_cols,)) fw = fw - fw.min() @@ -531,7 +559,9 @@ async def run_from_dask_array_asyncio(scheduler_address: str) -> dxgb.TrainRetur y = y.map_blocks(cp.array) m = await xgb.dask.DaskQuantileDMatrix(client, X, y) - output = await xgb.dask.train(client, {"tree_method": "gpu_hist"}, dtrain=m) + output = await xgb.dask.train( + client, {"tree_method": "hist", "device": "cuda"}, dtrain=m + ) with_m = await xgb.dask.predict(client, output, m) with_X = await xgb.dask.predict(client, output, X) diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py index cab4188a8..66c6058a5 100644 --- a/tests/test_distributed/test_with_dask/test_with_dask.py +++ b/tests/test_distributed/test_with_dask/test_with_dask.py @@ -45,7 +45,7 @@ from xgboost.testing.dask import check_init_estimation, check_uneven_nan dask.config.set({"distributed.scheduler.allowed-failures": False}) -if hasattr(HealthCheck, 'function_scoped_fixture'): +if hasattr(HealthCheck, "function_scoped_fixture"): suppress = [HealthCheck.function_scoped_fixture] else: suppress = hypothesis.utils.conventions.not_set # type:ignore @@ -131,7 +131,9 @@ def generate_array( return X, y, None -def deterministic_persist_per_worker(df: dd.DataFrame, client: "Client") -> dd.DataFrame: +def deterministic_persist_per_worker( + df: dd.DataFrame, client: "Client" +) -> dd.DataFrame: # Got this script from https://github.com/dmlc/xgboost/issues/7927 # Query workers n_workers = len(client.cluster.workers) @@ -196,6 +198,7 @@ def test_xgbclassifier_classes_type_and_value(to_frame: bool, client: "Client"): X, y = make_classification(n_samples=1000, n_features=4, random_state=123) if to_frame: import pandas as pd + feats = [f"var_{i}" for i in range(4)] df = pd.DataFrame(X, columns=feats) df["target"] = y @@ -219,7 +222,7 @@ def test_from_dask_dataframe() -> None: y = dd.from_dask_array(y) dtrain = DaskDMatrix(client, X, y) - booster = xgb.dask.train(client, {}, dtrain, num_boost_round=2)['booster'] + booster = xgb.dask.train(client, {}, dtrain, num_boost_round=2)["booster"] prediction = xgb.dask.predict(client, 
model=booster, data=dtrain) @@ -230,7 +233,8 @@ def test_from_dask_dataframe() -> None: with pytest.raises(TypeError): # evals_result is not supported in dask interface. xgb.dask.train( # type:ignore - client, {}, dtrain, num_boost_round=2, evals_result={}) + client, {}, dtrain, num_boost_round=2, evals_result={} + ) # force prediction to be computed from_dmatrix = prediction.compute() @@ -243,8 +247,9 @@ def test_from_dask_dataframe() -> None: series_predictions = xgb.dask.inplace_predict(client, booster, X) assert isinstance(series_predictions, dd.Series) - np.testing.assert_allclose(series_predictions.compute().values, - from_dmatrix) + np.testing.assert_allclose( + series_predictions.compute().values, from_dmatrix + ) # Make sure the output can be integrated back to original dataframe X["predict"] = prediction @@ -303,7 +308,8 @@ def test_dask_sparse(client: "Client") -> None: clf.fit(X, y, eval_set=[(X, y)]) sparse_results = clf.evals_result() np.testing.assert_allclose( - dense_results["validation_0"]["mlogloss"], sparse_results["validation_0"]["mlogloss"] + dense_results["validation_0"]["mlogloss"], + sparse_results["validation_0"]["mlogloss"], ) @@ -357,7 +363,7 @@ def run_categorical(client: "Client", tree_method: str, X, X_onehot, y) -> None: n_estimators=10, tree_method=tree_method, # force onehot - max_cat_to_onehot=9999 + max_cat_to_onehot=9999, ) reg.fit(X, y) @@ -435,10 +441,15 @@ def run_boost_from_prediction_multi_class( X: dd.DataFrame, y: dd.Series, tree_method: str, + device: str, client: "Client", ) -> None: model_0 = xgb.dask.DaskXGBClassifier( - learning_rate=0.3, n_estimators=4, tree_method=tree_method, max_bin=768 + learning_rate=0.3, + n_estimators=4, + tree_method=tree_method, + max_bin=768, + device=device, ) X, y, _ = deterministic_repartition(client, X, y, None) model_0.fit(X=X, y=y) @@ -448,7 +459,11 @@ def run_boost_from_prediction_multi_class( margin.columns = [f"m_{i}" for i in range(margin.shape[1])] model_1 = xgb.dask.DaskXGBClassifier( - learning_rate=0.3, n_estimators=4, tree_method=tree_method, max_bin=768 + learning_rate=0.3, + n_estimators=4, + tree_method=tree_method, + max_bin=768, + device=device, ) X, y, margin = deterministic_repartition(client, X, y, margin) model_1.fit(X=X, y=y, base_margin=margin) @@ -460,7 +475,11 @@ def run_boost_from_prediction_multi_class( ) model_2 = xgb.dask.DaskXGBClassifier( - learning_rate=0.3, n_estimators=8, tree_method=tree_method, max_bin=768 + learning_rate=0.3, + n_estimators=8, + tree_method=tree_method, + max_bin=768, + device=device, ) X, y, _ = deterministic_repartition(client, X, y, None) model_2.fit(X=X, y=y) @@ -483,19 +502,28 @@ def run_boost_from_prediction( X: dd.DataFrame, y: dd.Series, tree_method: str, + device: str, client: "Client", ) -> None: X, y = client.persist([X, y]) model_0 = xgb.dask.DaskXGBClassifier( - learning_rate=0.3, n_estimators=4, tree_method=tree_method, max_bin=512 + learning_rate=0.3, + n_estimators=4, + tree_method=tree_method, + max_bin=512, + device=device, ) X, y, _ = deterministic_repartition(client, X, y, None) model_0.fit(X=X, y=y) margin: dd.Series = model_0.predict(X, output_margin=True) model_1 = xgb.dask.DaskXGBClassifier( - learning_rate=0.3, n_estimators=4, tree_method=tree_method, max_bin=512 + learning_rate=0.3, + n_estimators=4, + tree_method=tree_method, + max_bin=512, + device=device, ) X, y, margin = deterministic_repartition(client, X, y, margin) model_1.fit(X=X, y=y, base_margin=margin) @@ -503,7 +531,11 @@ def run_boost_from_prediction( predictions_1: 
dd.Series = model_1.predict(X, base_margin=margin) model_2 = xgb.dask.DaskXGBClassifier( - learning_rate=0.3, n_estimators=8, tree_method=tree_method, max_bin=512 + learning_rate=0.3, + n_estimators=8, + tree_method=tree_method, + max_bin=512, + device=device, ) X, y, _ = deterministic_repartition(client, X, y, None) model_2.fit(X=X, y=y) @@ -539,17 +571,19 @@ def run_boost_from_prediction( @pytest.mark.parametrize("tree_method", ["hist", "approx"]) def test_boost_from_prediction(tree_method: str, client: "Client") -> None: from sklearn.datasets import load_breast_cancer, load_digits + X_, y_ = load_breast_cancer(return_X_y=True) X, y = dd.from_array(X_, chunksize=200), dd.from_array(y_, chunksize=200) - run_boost_from_prediction(X, y, tree_method, client) + run_boost_from_prediction(X, y, tree_method, "cpu", client) X_, y_ = load_digits(return_X_y=True) X, y = dd.from_array(X_, chunksize=100), dd.from_array(y_, chunksize=100) - run_boost_from_prediction_multi_class(X, y, tree_method, client) + run_boost_from_prediction_multi_class(X, y, tree_method, "cpu", client) def test_inplace_predict(client: "Client") -> None: from sklearn.datasets import load_diabetes + X_, y_ = load_diabetes(return_X_y=True) X, y = dd.from_array(X_, chunksize=32), dd.from_array(y_, chunksize=32) reg = xgb.dask.DaskXGBRegressor(n_estimators=4).fit(X, y) @@ -573,16 +607,14 @@ def test_dask_missing_value_reg(client: "Client") -> None: X = X.rechunk(20, 1) y = da.random.randint(0, 3, size=20) y.rechunk(20) - regressor = xgb.dask.DaskXGBRegressor(verbosity=1, n_estimators=2, - missing=0.0) + regressor = xgb.dask.DaskXGBRegressor(verbosity=1, n_estimators=2, missing=0.0) regressor.client = client - regressor.set_params(tree_method='hist') + regressor.set_params(tree_method="hist") regressor.fit(X, y, eval_set=[(X, y)]) dd_predt = regressor.predict(X).compute() np_X = X.compute() - np_predt = regressor.get_booster().predict( - xgb.DMatrix(np_X, missing=0.0)) + np_predt = regressor.get_booster().predict(xgb.DMatrix(np_X, missing=0.0)) np.testing.assert_allclose(np_predt, dd_predt) @@ -595,20 +627,19 @@ def test_dask_missing_value_cls(client: "Client") -> None: X = X.rechunk(20, None) y = da.random.randint(0, 3, size=kRows) y = y.rechunk(20, 1) - cls = xgb.dask.DaskXGBClassifier(verbosity=1, n_estimators=2, - tree_method='hist', - missing=0.0) + cls = xgb.dask.DaskXGBClassifier( + verbosity=1, n_estimators=2, tree_method="hist", missing=0.0 + ) cls.client = client cls.fit(X, y, eval_set=[(X, y)]) dd_pred_proba = cls.predict_proba(X).compute() np_X = X.compute() - np_pred_proba = cls.get_booster().predict( - xgb.DMatrix(np_X, missing=0.0)) + np_pred_proba = cls.get_booster().predict(xgb.DMatrix(np_X, missing=0.0)) np.testing.assert_allclose(np_pred_proba, dd_pred_proba) cls = xgb.dask.DaskXGBClassifier() - assert hasattr(cls, 'missing') + assert hasattr(cls, "missing") @pytest.mark.parametrize("model", ["boosting", "rf"]) @@ -622,7 +653,7 @@ def test_dask_regressor(model: str, client: "Client") -> None: assert regressor._estimator_type == "regressor" assert sklearn.base.is_regressor(regressor) - regressor.set_params(tree_method='hist') + regressor.set_params(tree_method="hist") regressor.client = client regressor.fit(X, y, sample_weight=w, eval_set=[(X, y)]) prediction = regressor.predict(X) @@ -635,7 +666,7 @@ def test_dask_regressor(model: str, client: "Client") -> None: assert isinstance(prediction, da.Array) assert isinstance(history, dict) - assert list(history['validation_0'].keys())[0] == 'rmse' + assert 
list(history["validation_0"].keys())[0] == "rmse" forest = int( json.loads(regressor.get_booster().save_config())["learner"][ "gradient_booster" @@ -643,10 +674,10 @@ def test_dask_regressor(model: str, client: "Client") -> None: ) if model == "boosting": - assert len(history['validation_0']['rmse']) == 2 + assert len(history["validation_0"]["rmse"]) == 2 assert forest == 1 else: - assert len(history['validation_0']['rmse']) == 1 + assert len(history["validation_0"]["rmse"]) == 1 assert forest == 2 @@ -753,30 +784,38 @@ def test_empty_dmatrix_training_continuation(client: "Client") -> None: kRows, kCols = 1, 97 X = dd.from_array(np.random.randn(kRows, kCols)) y = dd.from_array(np.random.rand(kRows)) - X.columns = ['X' + str(i) for i in range(0, kCols)] + X.columns = ["X" + str(i) for i in range(0, kCols)] dtrain = xgb.dask.DaskDMatrix(client, X, y) kRows += 1000 X = dd.from_array(np.random.randn(kRows, kCols), chunksize=10) - X.columns = ['X' + str(i) for i in range(0, kCols)] + X.columns = ["X" + str(i) for i in range(0, kCols)] y = dd.from_array(np.random.rand(kRows), chunksize=10) valid = xgb.dask.DaskDMatrix(client, X, y) - out = xgb.dask.train(client, {'tree_method': 'hist'}, - dtrain=dtrain, num_boost_round=2, - evals=[(valid, 'validation')]) + out = xgb.dask.train( + client, + {"tree_method": "hist"}, + dtrain=dtrain, + num_boost_round=2, + evals=[(valid, "validation")], + ) - out = xgb.dask.train(client, {'tree_method': 'hist'}, - dtrain=dtrain, xgb_model=out['booster'], - num_boost_round=2, - evals=[(valid, 'validation')]) + out = xgb.dask.train( + client, + {"tree_method": "hist"}, + dtrain=dtrain, + xgb_model=out["booster"], + num_boost_round=2, + evals=[(valid, "validation")], + ) assert xgb.dask.predict(client, out, dtrain).compute().shape[0] == 1 def run_empty_dmatrix_reg(client: "Client", parameters: dict) -> None: def _check_outputs(out: xgb.dask.TrainReturnT, predictions: np.ndarray) -> None: - assert isinstance(out['booster'], xgb.dask.Booster) - for _, v in out['history']['validation'].items(): + assert isinstance(out["booster"], xgb.dask.Booster) + for _, v in out["history"]["validation"].items(): assert len(v) == 2 assert isinstance(predictions, np.ndarray) assert predictions.shape[0] == 1 @@ -786,12 +825,14 @@ def run_empty_dmatrix_reg(client: "Client", parameters: dict) -> None: y = dd.from_array(np.random.rand(kRows)) dtrain = xgb.dask.DaskDMatrix(client, X, y) - out = xgb.dask.train(client, parameters, - dtrain=dtrain, - evals=[(dtrain, 'validation')], - num_boost_round=2) - predictions = xgb.dask.predict(client=client, model=out, - data=dtrain).compute() + out = xgb.dask.train( + client, + parameters, + dtrain=dtrain, + evals=[(dtrain, "validation")], + num_boost_round=2, + ) + predictions = xgb.dask.predict(client=client, model=out, data=dtrain).compute() _check_outputs(out, predictions) # valid has more rows than train @@ -799,12 +840,14 @@ def run_empty_dmatrix_reg(client: "Client", parameters: dict) -> None: X = dd.from_array(np.random.randn(kRows, kCols)) y = dd.from_array(np.random.rand(kRows)) valid = xgb.dask.DaskDMatrix(client, X, y) - out = xgb.dask.train(client, parameters, - dtrain=dtrain, - evals=[(valid, 'validation')], - num_boost_round=2) - predictions = xgb.dask.predict(client=client, model=out, - data=dtrain).compute() + out = xgb.dask.train( + client, + parameters, + dtrain=dtrain, + evals=[(valid, "validation")], + num_boost_round=2, + ) + predictions = xgb.dask.predict(client=client, model=out, data=dtrain).compute() _check_outputs(out, 
predictions) # train has more rows than evals @@ -814,12 +857,14 @@ def run_empty_dmatrix_reg(client: "Client", parameters: dict) -> None: y = dd.from_array(np.random.rand(kRows)) dtrain = xgb.dask.DaskDMatrix(client, X, y) - out = xgb.dask.train(client, parameters, - dtrain=dtrain, - evals=[(valid, 'validation')], - num_boost_round=2) - predictions = xgb.dask.predict(client=client, model=out, - data=valid).compute() + out = xgb.dask.train( + client, + parameters, + dtrain=dtrain, + evals=[(valid, "validation")], + num_boost_round=2, + ) + predictions = xgb.dask.predict(client=client, model=out, data=valid).compute() _check_outputs(out, predictions) @@ -827,8 +872,8 @@ def run_empty_dmatrix_cls(client: "Client", parameters: dict) -> None: n_classes = 4 def _check_outputs(out: xgb.dask.TrainReturnT, predictions: np.ndarray) -> None: - assert isinstance(out['booster'], xgb.dask.Booster) - assert len(out['history']['validation']['merror']) == 2 + assert isinstance(out["booster"], xgb.dask.Booster) + assert len(out["history"]["validation"]["merror"]) == 2 assert isinstance(predictions, np.ndarray) assert predictions.shape[1] == n_classes, predictions.shape @@ -836,16 +881,18 @@ def run_empty_dmatrix_cls(client: "Client", parameters: dict) -> None: X = dd.from_array(np.random.randn(kRows, kCols)) y = dd.from_array(np.random.randint(low=0, high=n_classes, size=kRows)) dtrain = xgb.dask.DaskDMatrix(client, X, y) - parameters['objective'] = 'multi:softprob' - parameters['eval_metric'] = 'merror' - parameters['num_class'] = n_classes + parameters["objective"] = "multi:softprob" + parameters["eval_metric"] = "merror" + parameters["num_class"] = n_classes - out = xgb.dask.train(client, parameters, - dtrain=dtrain, - evals=[(dtrain, 'validation')], - num_boost_round=2) - predictions = xgb.dask.predict(client=client, model=out, - data=dtrain) + out = xgb.dask.train( + client, + parameters, + dtrain=dtrain, + evals=[(dtrain, "validation")], + num_boost_round=2, + ) + predictions = xgb.dask.predict(client=client, model=out, data=dtrain) assert predictions.shape[1] == n_classes predictions = predictions.compute() _check_outputs(out, predictions) @@ -857,25 +904,26 @@ def run_empty_dmatrix_cls(client: "Client", parameters: dict) -> None: y = dd.from_array(np.random.randint(low=0, high=n_classes, size=kRows)) dtrain = xgb.dask.DaskDMatrix(client, X, y) - out = xgb.dask.train(client, parameters, - dtrain=dtrain, - evals=[(valid, 'validation')], - num_boost_round=2) - predictions = xgb.dask.predict(client=client, model=out, - data=valid).compute() + out = xgb.dask.train( + client, + parameters, + dtrain=dtrain, + evals=[(valid, "validation")], + num_boost_round=2, + ) + predictions = xgb.dask.predict(client=client, model=out, data=valid).compute() _check_outputs(out, predictions) -def run_empty_dmatrix_auc(client: "Client", tree_method: str, n_workers: int) -> None: +def run_empty_dmatrix_auc(client: "Client", device: str, n_workers: int) -> None: from sklearn import datasets + n_samples = 100 n_features = 7 rng = np.random.RandomState(1994) make_classification = partial( - datasets.make_classification, - n_features=n_features, - random_state=rng + datasets.make_classification, n_features=n_features, random_state=rng ) # binary @@ -888,7 +936,7 @@ def run_empty_dmatrix_auc(client: "Client", tree_method: str, n_workers: int) -> valid_X = dd.from_array(valid_X_, chunksize=n_samples) valid_y = dd.from_array(valid_y_, chunksize=n_samples) - cls = xgb.dask.DaskXGBClassifier(tree_method=tree_method, n_estimators=2) 
+ cls = xgb.dask.DaskXGBClassifier(device=device, n_estimators=2) cls.fit(X, y, eval_metric=["auc", "aucpr"], eval_set=[(valid_X, valid_y)]) # multiclass @@ -897,7 +945,7 @@ def run_empty_dmatrix_auc(client: "Client", tree_method: str, n_workers: int) -> n_classes=n_workers, n_informative=n_features, n_redundant=0, - n_repeated=0 + n_repeated=0, ) for i in range(y_.shape[0]): y_[i] = i % n_workers @@ -910,25 +958,26 @@ def run_empty_dmatrix_auc(client: "Client", tree_method: str, n_workers: int) -> n_classes=n_workers, n_informative=n_features, n_redundant=0, - n_repeated=0 + n_repeated=0, ) for i in range(valid_y_.shape[0]): valid_y_[i] = i % n_workers valid_X = dd.from_array(valid_X_, chunksize=n_samples) valid_y = dd.from_array(valid_y_, chunksize=n_samples) - cls = xgb.dask.DaskXGBClassifier(tree_method=tree_method, n_estimators=2) + cls = xgb.dask.DaskXGBClassifier(device=device, n_estimators=2) cls.fit(X, y, eval_metric=["auc", "aucpr"], eval_set=[(valid_X, valid_y)]) def test_empty_dmatrix_auc() -> None: with LocalCluster(n_workers=4, dashboard_address=":0") as cluster: with Client(cluster) as client: - run_empty_dmatrix_auc(client, "hist", 4) + run_empty_dmatrix_auc(client, "cpu", 4) -def run_auc(client: "Client", tree_method: str) -> None: +def run_auc(client: "Client", device: str) -> None: from sklearn import datasets + n_samples = 100 n_features = 97 rng = np.random.RandomState(1994) @@ -944,10 +993,10 @@ def run_auc(client: "Client", tree_method: str) -> None: valid_X = dd.from_array(valid_X_, chunksize=10) valid_y = dd.from_array(valid_y_, chunksize=10) - cls = xgb.XGBClassifier(tree_method=tree_method, n_estimators=2) + cls = xgb.XGBClassifier(device=device, n_estimators=2) cls.fit(X_, y_, eval_metric="auc", eval_set=[(valid_X_, valid_y_)]) - dcls = xgb.dask.DaskXGBClassifier(tree_method=tree_method, n_estimators=2) + dcls = xgb.dask.DaskXGBClassifier(device=device, n_estimators=2) dcls.fit(X, y, eval_metric="auc", eval_set=[(valid_X, valid_y)]) approx = dcls.evals_result()["validation_0"]["auc"] @@ -958,7 +1007,7 @@ def run_auc(client: "Client", tree_method: str) -> None: def test_auc(client: "Client") -> None: - run_auc(client, "hist") + run_auc(client, "cpu") # No test for Exact, as empty DMatrix handling are mostly for distributed @@ -967,10 +1016,10 @@ def test_auc(client: "Client") -> None: def test_empty_dmatrix(tree_method) -> None: with LocalCluster(n_workers=kWorkers, dashboard_address=":0") as cluster: with Client(cluster) as client: - parameters = {'tree_method': tree_method} + parameters = {"tree_method": tree_method} run_empty_dmatrix_reg(client, parameters) run_empty_dmatrix_cls(client, parameters) - parameters = {'tree_method': tree_method, "objective": "reg:absoluteerror"} + parameters = {"tree_method": tree_method, "objective": "reg:absoluteerror"} run_empty_dmatrix_reg(client, parameters) @@ -987,10 +1036,12 @@ async def run_from_dask_array_asyncio(scheduler_address: str) -> xgb.dask.TrainR assert isinstance(with_X, da.Array) assert isinstance(inplace, da.Array) - np.testing.assert_allclose(await client.compute(with_m), - await client.compute(with_X)) - np.testing.assert_allclose(await client.compute(with_m), - await client.compute(inplace)) + np.testing.assert_allclose( + await client.compute(with_m), await client.compute(with_X) + ) + np.testing.assert_allclose( + await client.compute(with_m), await client.compute(inplace) + ) return output @@ -998,7 +1049,7 @@ async def run_dask_regressor_asyncio(scheduler_address: str) -> None: async with 
Client(scheduler_address, asynchronous=True) as client: X, y, _ = generate_array() regressor = await xgb.dask.DaskXGBRegressor(verbosity=1, n_estimators=2) - regressor.set_params(tree_method='hist') + regressor.set_params(tree_method="hist") regressor.client = client await regressor.fit(X, y, eval_set=[(X, y)]) prediction = await regressor.predict(X) @@ -1011,8 +1062,8 @@ async def run_dask_regressor_asyncio(scheduler_address: str) -> None: assert isinstance(prediction, da.Array) assert isinstance(history, dict) - assert list(history['validation_0'].keys())[0] == 'rmse' - assert len(history['validation_0']['rmse']) == 2 + assert list(history["validation_0"].keys())[0] == "rmse" + assert len(history["validation_0"]["rmse"]) == 2 awaited = await client.compute(prediction) assert awaited.shape[0] == kRows @@ -1023,7 +1074,8 @@ async def run_dask_classifier_asyncio(scheduler_address: str) -> None: X, y, _ = generate_array() y = (y * 10).astype(np.int32) classifier = await xgb.dask.DaskXGBClassifier( - verbosity=1, n_estimators=2, eval_metric='merror') + verbosity=1, n_estimators=2, eval_metric="merror" + ) classifier.client = client await classifier.fit(X, y, eval_set=[(X, y)]) prediction = await classifier.predict(X) @@ -1036,10 +1088,10 @@ async def run_dask_classifier_asyncio(scheduler_address: str) -> None: assert isinstance(prediction, da.Array) assert isinstance(history, dict) - assert list(history.keys())[0] == 'validation_0' - assert list(history['validation_0'].keys())[0] == 'merror' - assert len(list(history['validation_0'])) == 1 - assert len(history['validation_0']['merror']) == 2 + assert list(history.keys())[0] == "validation_0" + assert list(history["validation_0"].keys())[0] == "merror" + assert len(list(history["validation_0"])) == 1 + assert len(history["validation_0"]["merror"]) == 2 # Test .predict_proba() probas = await classifier.predict_proba(X) @@ -1065,8 +1117,8 @@ def test_with_asyncio() -> None: with Client(cluster) as client: address = client.scheduler.address output = asyncio.run(run_from_dask_array_asyncio(address)) - assert isinstance(output['booster'], xgb.Booster) - assert isinstance(output['history'], dict) + assert isinstance(output["booster"], xgb.Booster) + assert isinstance(output["history"], dict) asyncio.run(run_dask_regressor_asyncio(address)) asyncio.run(run_dask_classifier_asyncio(address)) @@ -1124,8 +1176,9 @@ def test_predict_with_meta(client: "Client") -> None: margin = da.random.random(kRows, partition_size) + 1e4 dtrain = DaskDMatrix(client, X, y, weight=w, base_margin=margin) - booster: xgb.Booster = xgb.dask.train( - client, {}, dtrain, num_boost_round=4)['booster'] + booster: xgb.Booster = xgb.dask.train(client, {}, dtrain, num_boost_round=4)[ + "booster" + ] prediction = xgb.dask.predict(client, model=booster, data=dtrain) assert prediction.ndim == 1 @@ -1141,41 +1194,41 @@ def test_predict_with_meta(client: "Client") -> None: def run_aft_survival(client: "Client", dmatrix_t: Type) -> None: - df = dd.read_csv( - os.path.join(tm.data_dir(__file__), "veterans_lung_cancer.csv") + df = dd.read_csv(os.path.join(tm.data_dir(__file__), "veterans_lung_cancer.csv")) + y_lower_bound = df["Survival_label_lower_bound"] + y_upper_bound = df["Survival_label_upper_bound"] + X = df.drop(["Survival_label_lower_bound", "Survival_label_upper_bound"], axis=1) + m = dmatrix_t( + client, X, label_lower_bound=y_lower_bound, label_upper_bound=y_upper_bound ) - y_lower_bound = df['Survival_label_lower_bound'] - y_upper_bound = df['Survival_label_upper_bound'] - X = 
df.drop(['Survival_label_lower_bound', - 'Survival_label_upper_bound'], axis=1) - m = dmatrix_t(client, X, label_lower_bound=y_lower_bound, - label_upper_bound=y_upper_bound) - base_params = {'verbosity': 0, - 'objective': 'survival:aft', - 'eval_metric': 'aft-nloglik', - 'learning_rate': 0.05, - 'aft_loss_distribution_scale': 1.20, - 'max_depth': 6, - 'lambda': 0.01, - 'alpha': 0.02} + base_params = { + "verbosity": 0, + "objective": "survival:aft", + "eval_metric": "aft-nloglik", + "learning_rate": 0.05, + "aft_loss_distribution_scale": 1.20, + "max_depth": 6, + "lambda": 0.01, + "alpha": 0.02, + } nloglik_rec = {} - dists = ['normal', 'logistic', 'extreme'] + dists = ["normal", "logistic", "extreme"] for dist in dists: params = base_params - params.update({'aft_loss_distribution': dist}) + params.update({"aft_loss_distribution": dist}) evals_result = {} - out = xgb.dask.train(client, params, m, num_boost_round=100, - evals=[(m, 'train')]) - evals_result = out['history'] - nloglik_rec[dist] = evals_result['train']['aft-nloglik'] + out = xgb.dask.train( + client, params, m, num_boost_round=100, evals=[(m, "train")] + ) + evals_result = out["history"] + nloglik_rec[dist] = evals_result["train"]["aft-nloglik"] # AFT metric (negative log likelihood) improve monotonically - assert all(p >= q for p, q in zip(nloglik_rec[dist], - nloglik_rec[dist][:1])) + assert all(p >= q for p, q in zip(nloglik_rec[dist], nloglik_rec[dist][:1])) # For this data, normal distribution works the best - assert nloglik_rec['normal'][-1] < 4.9 - assert nloglik_rec['logistic'][-1] > 4.9 - assert nloglik_rec['extreme'][-1] > 4.9 + assert nloglik_rec["normal"][-1] < 4.9 + assert nloglik_rec["logistic"][-1] > 4.9 + assert nloglik_rec["extreme"][-1] > 4.9 def test_dask_aft_survival() -> None: @@ -1244,7 +1297,7 @@ def test_dask_predict_leaf(booster: str, client: "Client") -> None: leaf = xgb.dask.predict( client, cls.get_booster(), - X.to_dask_array(), # we can't map_blocks on dataframe when output is 4-dim. + X.to_dask_array(), # we can't map_blocks on dataframe when output is 4-dim. 
pred_leaf=True, strict_shape=True, validate_features=False, @@ -1304,7 +1357,7 @@ class TestWithDask: path = os.path.join(tmpdir, f"{rank}.bin") Xy.save_binary(path) - def load_dmatrix(rabit_args: Dict[str, Union[int,str]], tmpdir: str) -> None: + def load_dmatrix(rabit_args: Dict[str, Union[int, str]], tmpdir: str) -> None: with xgb.dask.CommunicatorContext(**rabit_args): rank = xgb.collective.get_rank() path = os.path.join(tmpdir, f"{rank}.bin") @@ -1339,22 +1392,21 @@ class TestWithDask: futures.append(f) client.gather(futures) - @pytest.mark.parametrize('config_key,config_value', [('verbosity', 0), ('use_rmm', True)]) + @pytest.mark.parametrize( + "config_key,config_value", [("verbosity", 0), ("use_rmm", True)] + ) def test_global_config( - self, - client: "Client", - config_key: str, - config_value: Any + self, client: "Client", config_key: str, config_value: Any ) -> None: X, y, _ = generate_array() xgb.config.set_config(**{config_key: config_value}) dtrain = DaskDMatrix(client, X, y) - before_fname = './before_training-test_global_config' - after_fname = './after_training-test_global_config' + before_fname = "./before_training-test_global_config" + after_fname = "./after_training-test_global_config" class TestCallback(xgb.callback.TrainingCallback): def write_file(self, fname: str) -> None: - with open(fname, 'w') as fd: + with open(fname, "w") as fd: fd.write(str(xgb.config.get_config()[config_key])) def before_training(self, model: xgb.Booster) -> xgb.Booster: @@ -1367,33 +1419,34 @@ class TestWithDask: return model def before_iteration( - self, model: xgb.Booster, epoch: int, evals_log: Dict + self, model: xgb.Booster, epoch: int, evals_log: Dict ) -> bool: assert xgb.config.get_config()[config_key] == config_value return False def after_iteration( - self, model: xgb.Booster, epoch: int, evals_log: Dict + self, model: xgb.Booster, epoch: int, evals_log: Dict ) -> bool: self.write_file(after_fname) assert xgb.config.get_config()[config_key] == config_value return False - xgb.dask.train(client, {}, dtrain, num_boost_round=4, callbacks=[TestCallback()])[ - 'booster'] + xgb.dask.train( + client, {}, dtrain, num_boost_round=4, callbacks=[TestCallback()] + )["booster"] - with open(before_fname, 'r') as before, open(after_fname, 'r') as after: + with open(before_fname, "r") as before, open(after_fname, "r") as after: assert before.read() == str(config_value) assert after.read() == str(config_value) os.remove(before_fname) os.remove(after_fname) - with dask.config.set({'xgboost.foo': "bar"}): + with dask.config.set({"xgboost.foo": "bar"}): with pytest.raises(ValueError, match=r"Unknown configuration.*"): xgb.dask.train(client, {}, dtrain, num_boost_round=4) - with dask.config.set({'xgboost.scheduler_address': "127.0.0.1:foo"}): + with dask.config.set({"xgboost.scheduler_address": "127.0.0.1:foo"}): with pytest.raises(socket.gaierror, match=r".*not known.*"): xgb.dask.train(client, {}, dtrain, num_boost_round=1) @@ -1403,9 +1456,9 @@ class TestWithDask: params: Dict, num_rounds: int, dataset: tm.TestDataset, - tree_method: str + tree_method: str, ) -> None: - params['tree_method'] = tree_method + params["tree_method"] = tree_method params = dataset.set_params(params) # It doesn't make sense to distribute a completely # empty dataset. 
@@ -1462,10 +1515,10 @@ class TestWithDask: deadline=None, max_examples=10, suppress_health_check=suppress, print_blob=True ) def test_hist( - self, params: Dict, dataset: tm.TestDataset, client: "Client" + self, params: Dict, dataset: tm.TestDataset, client: "Client" ) -> None: num_rounds = 10 - self.run_updater_test(client, params, num_rounds, dataset, 'hist') + self.run_updater_test(client, params, num_rounds, dataset, "hist") def test_quantile_dmatrix(self, client: Client) -> None: X, y = make_categorical(client, 10000, 30, 13) @@ -1478,7 +1531,7 @@ class TestWithDask: {"tree_method": "hist"}, Xy, num_boost_round=10, - evals=[(Xy, "Train"), (valid_Xy, "Valid")] + evals=[(Xy, "Train"), (valid_Xy, "Valid")], ) dmatrix_hist = output["history"] @@ -1492,7 +1545,7 @@ class TestWithDask: {"tree_method": "hist"}, Xy, num_boost_round=10, - evals=[(Xy, "Train"), (valid_Xy, "Valid")] + evals=[(Xy, "Train"), (valid_Xy, "Valid")], ) quantile_hist = output["history"] @@ -1532,7 +1585,7 @@ class TestWithDask: self, client: "Client", params: Dict, dataset: tm.TestDataset ) -> None: num_rounds = 10 - self.run_updater_test(client, params, num_rounds, dataset, 'approx') + self.run_updater_test(client, params, num_rounds, dataset, "approx") def test_adaptive(self) -> None: def get_score(config: Dict) -> float: @@ -1593,7 +1646,9 @@ class TestWithDask: dy = client.persist(dy, workers=workers[1]) valid = xgb.dask.DaskDMatrix(client, dX, dy) - merged = xgb.dask._get_workers_from_data(train, evals=[(valid, 'Valid')]) + merged = xgb.dask._get_workers_from_data( + train, evals=[(valid, "Valid")] + ) assert len(merged) == 2 @pytest.mark.skipif(**tm.no_dask()) @@ -1630,28 +1685,30 @@ class TestWithDask: @pytest.mark.skipif(**tm.no_sklearn()) def test_custom_objective(self, client: "Client") -> None: from sklearn.datasets import fetch_california_housing + X, y = fetch_california_housing(return_X_y=True) X, y = da.from_array(X), da.from_array(y) rounds = 20 with tempfile.TemporaryDirectory() as tmpdir: - path = os.path.join(tmpdir, 'log') + path = os.path.join(tmpdir, "log") def sqr( labels: np.ndarray, predts: np.ndarray ) -> Tuple[np.ndarray, np.ndarray]: - with open(path, 'a') as fd: - print('Running sqr', file=fd) + with open(path, "a") as fd: + print("Running sqr", file=fd) grad = predts - labels hess = np.ones(shape=labels.shape[0]) return grad, hess - reg = xgb.dask.DaskXGBRegressor(n_estimators=rounds, objective=sqr, - tree_method='hist') + reg = xgb.dask.DaskXGBRegressor( + n_estimators=rounds, objective=sqr, tree_method="hist" + ) reg.fit(X, y, eval_set=[(X, y)]) # Check the obj is ran for rounds. - with open(path, 'r') as fd: + with open(path, "r") as fd: out = fd.readlines() assert len(out) == rounds @@ -1670,10 +1727,10 @@ class TestWithDask: tm.non_increasing(results_native["validation_0"]["rmse"]) def test_no_duplicated_partition(self) -> None: - '''Assert each worker has the correct amount of data, and DMatrix initialization doesn't + """Assert each worker has the correct amount of data, and DMatrix initialization doesn't generate unnecessary copies of data. 
- ''' + """ with LocalCluster(n_workers=2, dashboard_address=":0") as cluster: with Client(cluster) as client: X, y, _ = generate_array() @@ -1698,9 +1755,12 @@ class TestWithDask: for i in range(len(workers)): futures.append( client.submit( - worker_fn, workers[i], - m._create_fn_args(workers[i]), pure=False, - workers=[workers[i]]) + worker_fn, + workers[i], + m._create_fn_args(workers[i]), + pure=False, + workers=[workers[i]], + ) ) client.gather(futures) @@ -1719,13 +1779,16 @@ class TestWithDask: def test_data_initialization(self, client: "Client") -> None: """assert that we don't create duplicated DMatrix""" from sklearn.datasets import load_digits + X, y = load_digits(return_X_y=True) X, y = dd.from_array(X, chunksize=32), dd.from_array(y, chunksize=32) validate_data_initialization( - xgb.dask.DaskDMatrix, xgb.dask.DaskXGBClassifier, X, y + xgb.dask.DaskQuantileDMatrix, xgb.dask.DaskXGBClassifier, X, y ) - def run_shap(self, X: Any, y: Any, params: Dict[str, Any], client: "Client") -> None: + def run_shap( + self, X: Any, y: Any, params: Dict[str, Any], client: "Client" + ) -> None: rows = X.shape[0] cols = X.shape[1] @@ -1739,12 +1802,14 @@ class TestWithDask: X, y = da.from_array(X, chunks=(32, -1)), da.from_array(y, chunks=32) Xy = xgb.dask.DaskDMatrix(client, X, y) - booster = xgb.dask.train(client, params, Xy, num_boost_round=10)['booster'] + booster = xgb.dask.train(client, params, Xy, num_boost_round=10)["booster"] test_Xy = xgb.dask.DaskDMatrix(client, X, y) shap = xgb.dask.predict(client, booster, test_Xy, pred_contribs=True).compute() - margin = xgb.dask.predict(client, booster, test_Xy, output_margin=True).compute() + margin = xgb.dask.predict( + client, booster, test_Xy, output_margin=True + ).compute() assert_shape(shap.shape) assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin, 1e-5, 1e-5) @@ -1774,7 +1839,9 @@ class TestWithDask: test_Xy = xgb.dask.DaskDMatrix(client, X, y) shap = xgb.dask.predict(client, booster, test_Xy, pred_contribs=True).compute() - margin = xgb.dask.predict(client, booster, test_Xy, output_margin=True).compute() + margin = xgb.dask.predict( + client, booster, test_Xy, output_margin=True + ).compute() assert np.allclose(np.sum(shap, axis=len(shap.shape) - 1), margin, 1e-5, 1e-5) shap = xgb.dask.predict(client, booster, X, pred_contribs=True).compute() @@ -1783,32 +1850,29 @@ class TestWithDask: def test_shap(self, client: "Client") -> None: from sklearn.datasets import load_diabetes, load_iris + X, y = load_diabetes(return_X_y=True) - params: Dict[str, Any] = {'objective': 'reg:squarederror'} + params: Dict[str, Any] = {"objective": "reg:squarederror"} self.run_shap(X, y, params, client) X, y = load_iris(return_X_y=True) - params = {'objective': 'multi:softmax', 'num_class': 3} + params = {"objective": "multi:softmax", "num_class": 3} self.run_shap(X, y, params, client) - params = {'objective': 'multi:softprob', 'num_class': 3} + params = {"objective": "multi:softprob", "num_class": 3} self.run_shap(X, y, params, client) self.run_shap_cls_sklearn(X, y, client) def run_shap_interactions( - self, - X: Any, - y: Any, - params: Dict[str, Any], - client: "Client" + self, X: Any, y: Any, params: Dict[str, Any], client: "Client" ) -> None: rows = X.shape[0] cols = X.shape[1] X, y = da.from_array(X, chunks=(32, -1)), da.from_array(y, chunks=32) Xy = xgb.dask.DaskDMatrix(client, X, y) - booster = xgb.dask.train(client, params, Xy, num_boost_round=10)['booster'] + booster = xgb.dask.train(client, params, Xy, num_boost_round=10)["booster"] 
test_Xy = xgb.dask.DaskDMatrix(client, X, y) @@ -1821,20 +1885,27 @@ class TestWithDask: assert shap.shape[1] == cols + 1 assert shap.shape[2] == cols + 1 - margin = xgb.dask.predict(client, booster, test_Xy, output_margin=True).compute() - assert np.allclose(np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2)), - margin, - 1e-5, 1e-5) + margin = xgb.dask.predict( + client, booster, test_Xy, output_margin=True + ).compute() + assert np.allclose( + np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2)), + margin, + 1e-5, + 1e-5, + ) def test_shap_interactions(self, client: "Client") -> None: from sklearn.datasets import load_diabetes + X, y = load_diabetes(return_X_y=True) - params = {'objective': 'reg:squarederror'} + params = {"objective": "reg:squarederror"} self.run_shap_interactions(X, y, params, client) @pytest.mark.skipif(**tm.no_sklearn()) - def test_sklearn_io(self, client: 'Client') -> None: + def test_sklearn_io(self, client: "Client") -> None: from sklearn.datasets import load_digits + X_, y_ = load_digits(return_X_y=True) X, y = da.from_array(X_), da.from_array(y_) cls = xgb.dask.DaskXGBClassifier(n_estimators=10) @@ -1852,7 +1923,7 @@ class TestWithDask: predt_1 = cls.predict(X) np.testing.assert_allclose(predt_0.compute(), predt_1.compute()) - path = os.path.join(tmpdir, 'cls.json') + path = os.path.join(tmpdir, "cls.json") cls.save_model(path) cls = xgb.dask.DaskXGBClassifier() @@ -1910,7 +1981,7 @@ def test_parallel_submits(client: "Client") -> None: assert cls.get_booster().num_boosted_rounds() == i + 1 -def run_tree_stats(client: Client, tree_method: str) -> str: +def run_tree_stats(client: Client, tree_method: str, device: str) -> str: """assert that different workers count dosn't affect summ statistic's on root""" def dask_train(X, y, num_obs, num_features): @@ -1924,6 +1995,7 @@ def run_tree_stats(client: Client, tree_method: str) -> str: { "verbosity": 0, "tree_method": tree_method, + "device": device, "objective": "reg:squarederror", "max_depth": 3, }, @@ -1957,10 +2029,10 @@ def run_tree_stats(client: Client, tree_method: str) -> str: def test_tree_stats(tree_method: str) -> None: with LocalCluster(n_workers=1, dashboard_address=":0") as cluster: with Client(cluster) as client: - local = run_tree_stats(client, tree_method) + local = run_tree_stats(client, tree_method, "cpu") with LocalCluster(n_workers=2, dashboard_address=":0") as cluster: with Client(cluster) as client: - distributed = run_tree_stats(client, tree_method) + distributed = run_tree_stats(client, tree_method, "cpu") assert local == distributed @@ -1999,6 +2071,7 @@ def test_parallel_submit_multi_clients() -> None: t_futures = [] with ThreadPoolExecutor(max_workers=16) as e: for i in range(n_submits): + def _() -> xgb.dask.DaskXGBClassifier: return futures[i][0].compute(futures[i][1]).result() @@ -2025,48 +2098,7 @@ class TestDaskCallbacks: @pytest.mark.skipif(**tm.no_sklearn()) def test_early_stopping(self, client: "Client") -> None: from sklearn.datasets import load_breast_cancer - X, y = load_breast_cancer(return_X_y=True) - X, y = da.from_array(X), da.from_array(y) - m = xgb.dask.DaskDMatrix(client, X, y) - valid = xgb.dask.DaskDMatrix(client, X, y) - early_stopping_rounds = 5 - booster = xgb.dask.train(client, {'objective': 'binary:logistic', - 'eval_metric': 'error', - 'tree_method': 'hist'}, m, - evals=[(valid, 'Valid')], - num_boost_round=1000, - early_stopping_rounds=early_stopping_rounds)['booster'] - assert hasattr(booster, 'best_score') - dump = 
booster.get_dump(dump_format='json') - assert len(dump) - booster.best_iteration == early_stopping_rounds + 1 - - valid_X, valid_y = load_breast_cancer(return_X_y=True) - valid_X, valid_y = da.from_array(valid_X), da.from_array(valid_y) - cls = xgb.dask.DaskXGBClassifier(objective='binary:logistic', tree_method='hist', - n_estimators=1000) - cls.client = client - cls.fit(X, y, early_stopping_rounds=early_stopping_rounds, - eval_set=[(valid_X, valid_y)]) - booster = cls.get_booster() - dump = booster.get_dump(dump_format='json') - assert len(dump) - booster.best_iteration == early_stopping_rounds + 1 - - # Specify the metric - cls = xgb.dask.DaskXGBClassifier(objective='binary:logistic', tree_method='hist', - n_estimators=1000) - cls.client = client - cls.fit(X, y, early_stopping_rounds=early_stopping_rounds, - eval_set=[(valid_X, valid_y)], eval_metric='error') - assert tm.non_increasing(cls.evals_result()['validation_0']['error']) - booster = cls.get_booster() - dump = booster.get_dump(dump_format='json') - assert len(cls.evals_result()['validation_0']['error']) < 20 - assert len(dump) - booster.best_iteration == early_stopping_rounds + 1 - - @pytest.mark.skipif(**tm.no_sklearn()) - def test_early_stopping_custom_eval(self, client: "Client") -> None: - from sklearn.datasets import load_breast_cancer X, y = load_breast_cancer(return_X_y=True) X, y = da.from_array(X), da.from_array(y) m = xgb.dask.DaskDMatrix(client, X, y) @@ -2074,49 +2106,122 @@ class TestDaskCallbacks: valid = xgb.dask.DaskDMatrix(client, X, y) early_stopping_rounds = 5 booster = xgb.dask.train( - client, {'objective': 'binary:logistic', - 'eval_metric': 'error', - 'tree_method': 'hist'}, m, - evals=[(m, 'Train'), (valid, 'Valid')], - feval=tm.eval_error_metric, + client, + { + "objective": "binary:logistic", + "eval_metric": "error", + "tree_method": "hist", + }, + m, + evals=[(valid, "Valid")], num_boost_round=1000, - early_stopping_rounds=early_stopping_rounds)['booster'] - assert hasattr(booster, 'best_score') - dump = booster.get_dump(dump_format='json') + early_stopping_rounds=early_stopping_rounds, + )["booster"] + assert hasattr(booster, "best_score") + dump = booster.get_dump(dump_format="json") assert len(dump) - booster.best_iteration == early_stopping_rounds + 1 valid_X, valid_y = load_breast_cancer(return_X_y=True) valid_X, valid_y = da.from_array(valid_X), da.from_array(valid_y) cls = xgb.dask.DaskXGBClassifier( - objective='binary:logistic', - tree_method='hist', - n_estimators=1000, - eval_metric=tm.eval_error_metric_skl + objective="binary:logistic", tree_method="hist", n_estimators=1000 ) cls.client = client cls.fit( - X, y, early_stopping_rounds=early_stopping_rounds, eval_set=[(valid_X, valid_y)] + X, + y, + early_stopping_rounds=early_stopping_rounds, + eval_set=[(valid_X, valid_y)], ) booster = cls.get_booster() - dump = booster.get_dump(dump_format='json') + dump = booster.get_dump(dump_format="json") + assert len(dump) - booster.best_iteration == early_stopping_rounds + 1 + + # Specify the metric + cls = xgb.dask.DaskXGBClassifier( + objective="binary:logistic", tree_method="hist", n_estimators=1000 + ) + cls.client = client + cls.fit( + X, + y, + early_stopping_rounds=early_stopping_rounds, + eval_set=[(valid_X, valid_y)], + eval_metric="error", + ) + assert tm.non_increasing(cls.evals_result()["validation_0"]["error"]) + booster = cls.get_booster() + dump = booster.get_dump(dump_format="json") + assert len(cls.evals_result()["validation_0"]["error"]) < 20 + assert len(dump) - 
booster.best_iteration == early_stopping_rounds + 1 + + @pytest.mark.skipif(**tm.no_sklearn()) + def test_early_stopping_custom_eval(self, client: "Client") -> None: + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True) + X, y = da.from_array(X), da.from_array(y) + m = xgb.dask.DaskDMatrix(client, X, y) + + valid = xgb.dask.DaskDMatrix(client, X, y) + early_stopping_rounds = 5 + booster = xgb.dask.train( + client, + { + "objective": "binary:logistic", + "eval_metric": "error", + "tree_method": "hist", + }, + m, + evals=[(m, "Train"), (valid, "Valid")], + feval=tm.eval_error_metric, + num_boost_round=1000, + early_stopping_rounds=early_stopping_rounds, + )["booster"] + assert hasattr(booster, "best_score") + dump = booster.get_dump(dump_format="json") + assert len(dump) - booster.best_iteration == early_stopping_rounds + 1 + + valid_X, valid_y = load_breast_cancer(return_X_y=True) + valid_X, valid_y = da.from_array(valid_X), da.from_array(valid_y) + cls = xgb.dask.DaskXGBClassifier( + objective="binary:logistic", + tree_method="hist", + n_estimators=1000, + eval_metric=tm.eval_error_metric_skl, + ) + cls.client = client + cls.fit( + X, + y, + early_stopping_rounds=early_stopping_rounds, + eval_set=[(valid_X, valid_y)], + ) + booster = cls.get_booster() + dump = booster.get_dump(dump_format="json") assert len(dump) - booster.best_iteration == early_stopping_rounds + 1 @pytest.mark.skipif(**tm.no_sklearn()) def test_callback(self, client: "Client") -> None: from sklearn.datasets import load_breast_cancer + X, y = load_breast_cancer(return_X_y=True) X, y = da.from_array(X), da.from_array(y) - cls = xgb.dask.DaskXGBClassifier(objective='binary:logistic', tree_method='hist', - n_estimators=10) + cls = xgb.dask.DaskXGBClassifier( + objective="binary:logistic", tree_method="hist", n_estimators=10 + ) cls.client = client with tempfile.TemporaryDirectory() as tmpdir: - cls.fit(X, y, callbacks=[xgb.callback.TrainingCheckPoint( - directory=Path(tmpdir), - iterations=1, - name='model' - )]) + cls.fit( + X, + y, + callbacks=[ + xgb.callback.TrainingCheckPoint( + directory=Path(tmpdir), iterations=1, name="model" + ) + ], + ) for i in range(1, 10): - assert os.path.exists( - os.path.join(tmpdir, 'model_' + str(i) + '.json')) + assert os.path.exists(os.path.join(tmpdir, "model_" + str(i) + ".json")) diff --git a/tests/test_distributed/test_with_spark/test_spark_local.py b/tests/test_distributed/test_with_spark/test_spark_local.py index dfdadb2ef..124f36d02 100644 --- a/tests/test_distributed/test_with_spark/test_spark_local.py +++ b/tests/test_distributed/test_with_spark/test_spark_local.py @@ -1120,7 +1120,9 @@ class XgboostLocalTest(SparkTestCase): reg1 = SparkXGBRegressor(**self.reg_params) model = reg1.fit(self.reg_df_train) init_booster = model.get_booster() - reg2 = SparkXGBRegressor(max_depth=2, n_estimators=2, xgb_model=init_booster) + reg2 = SparkXGBRegressor( + max_depth=2, n_estimators=2, xgb_model=init_booster, max_bin=21 + ) model21 = reg2.fit(self.reg_df_train) pred_res21 = model21.transform(self.reg_df_test).collect() reg2.save(path) From 0a07900b9f60294e0daf79d59532479365af552a Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 15 Jul 2023 21:11:02 +0800 Subject: [PATCH 034/136] Fix integer overflow. 
(#9380)

---
 src/common/hist_util.cu              | 19 +++++++++----------
 tests/python-gpu/test_large_input.py |  7 ++++---
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu
index 1c9525a62..eabdb86de 100644
--- a/src/common/hist_util.cu
+++ b/src/common/hist_util.cu
@@ -26,9 +26,8 @@
 #include "quantile.h"
 #include "xgboost/host_device_vector.h"

-namespace xgboost {
-namespace common {
+namespace xgboost::common {
 constexpr float SketchContainer::kFactor;

 namespace detail {
@@ -87,13 +86,13 @@ size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz,
   return peak;
 }

-size_t SketchBatchNumElements(size_t sketch_batch_num_elements,
-                              bst_row_t num_rows, bst_feature_t columns,
-                              size_t nnz, int device,
-                              size_t num_cuts, bool has_weight) {
+size_t SketchBatchNumElements(size_t sketch_batch_num_elements, bst_row_t num_rows,
+                              bst_feature_t columns, size_t nnz, int device, size_t num_cuts,
+                              bool has_weight) {
+  auto constexpr kIntMax = static_cast<std::size_t>(std::numeric_limits<std::int32_t>::max());
 #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
   // device available memory is not accurate when rmm is used.
-  return nnz;
+  return std::min(nnz, kIntMax);
 #endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1

   if (sketch_batch_num_elements == 0) {
@@ -106,7 +105,8 @@ size_t SketchBatchNumElements(size_t sketch_batch_num_elements,
       sketch_batch_num_elements = std::min(num_rows * static_cast<size_t>(columns), nnz);
     }
   }
-  return sketch_batch_num_elements;
+
+  return std::min(sketch_batch_num_elements, kIntMax);
 }

 void SortByWeight(dh::device_vector<float>* weights,
@@ -355,5 +355,4 @@ HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins,
   sketch_container.MakeCuts(&cuts, dmat->Info().IsColumnSplit());
   return cuts;
 }
-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common
diff --git a/tests/python-gpu/test_large_input.py b/tests/python-gpu/test_large_input.py
index fdd3493e5..2d85cabc8 100644
--- a/tests/python-gpu/test_large_input.py
+++ b/tests/python-gpu/test_large_input.py
@@ -9,15 +9,16 @@ import xgboost as xgb

 def test_large_input():
     available_bytes, _ = cp.cuda.runtime.memGetInfo()
     # 15 GB
-    required_bytes = 1.5e+10
+    required_bytes = 1.5e10
     if available_bytes < required_bytes:
         pytest.skip("Not enough memory on this device")
     n = 1000
     m = ((1 << 31) + n - 1) // n
-    assert (np.log2(m * n) > 31)
+    assert np.log2(m * n) > 31
     X = cp.ones((m, n), dtype=np.float32)
     y = cp.ones(m)
-    dmat = xgb.QuantileDMatrix(X, y)
+    w = cp.ones(m)
+    dmat = xgb.QuantileDMatrix(X, y, weight=w)
     booster = xgb.train({"tree_method": "gpu_hist", "max_depth": 1}, dmat, 1)
     del y
     booster.inplace_predict(X)

From b342ef951b9040db70f8ef41ab22e5ca7f889a53 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Sun, 16 Jul 2023 06:52:55 +0800
Subject: [PATCH 035/136] Make feature validation immutable. (#9388)

---
 python-package/xgboost/core.py | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index 86c49e0ff..d41976e8b 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -1623,7 +1623,7 @@ class Booster:
             )
             for d in cache:
                 # Validate feature only after the feature names are saved into booster.
- self._validate_dmatrix_features(d) + self._assign_dmatrix_features(d) if isinstance(model_file, Booster): assert self.handle is not None @@ -1746,6 +1746,11 @@ class Booster: self.__dict__.update(state) def __getitem__(self, val: Union[int, tuple, slice]) -> "Booster": + """Get a slice of the tree-based model. + + .. versionadded:: 1.3.0 + + """ if isinstance(val, int): val = slice(val, val + 1) if isinstance(val, tuple): @@ -1784,6 +1789,11 @@ class Booster: return sliced def __iter__(self) -> Generator["Booster", None, None]: + """Iterator method for getting individual trees. + + .. versionadded:: 2.0.0 + + """ for i in range(0, self.num_boosted_rounds()): yield self[i] @@ -1994,7 +2004,7 @@ class Booster: """ if not isinstance(dtrain, DMatrix): raise TypeError(f"invalid training matrix: {type(dtrain).__name__}") - self._validate_dmatrix_features(dtrain) + self._assign_dmatrix_features(dtrain) if fobj is None: _check_call( @@ -2026,7 +2036,7 @@ class Booster: raise ValueError(f"grad / hess length mismatch: {len(grad)} / {len(hess)}") if not isinstance(dtrain, DMatrix): raise TypeError(f"invalid training matrix: {type(dtrain).__name__}") - self._validate_dmatrix_features(dtrain) + self._assign_dmatrix_features(dtrain) _check_call( _LIB.XGBoosterBoostOneIter( @@ -2067,7 +2077,7 @@ class Booster: raise TypeError(f"expected DMatrix, got {type(d[0]).__name__}") if not isinstance(d[1], str): raise TypeError(f"expected string, got {type(d[1]).__name__}") - self._validate_dmatrix_features(d[0]) + self._assign_dmatrix_features(d[0]) dmats = c_array(ctypes.c_void_p, [d[0].handle for d in evals]) evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals]) @@ -2119,7 +2129,7 @@ class Booster: result: str Evaluation result string. """ - self._validate_dmatrix_features(data) + self._assign_dmatrix_features(data) return self.eval_set([(data, name)], iteration) # pylint: disable=too-many-function-args @@ -2218,7 +2228,8 @@ class Booster: if not isinstance(data, DMatrix): raise TypeError("Expecting data to be a DMatrix object, got: ", type(data)) if validate_features: - self._validate_dmatrix_features(data) + fn = data.feature_names + self._validate_features(fn) args = { "type": 0, "training": training, @@ -2843,14 +2854,13 @@ class Booster: # pylint: disable=no-member return df.sort(["Tree", "Node"]).reset_index(drop=True) - def _validate_dmatrix_features(self, data: DMatrix) -> None: + def _assign_dmatrix_features(self, data: DMatrix) -> None: if data.num_row() == 0: return fn = data.feature_names ft = data.feature_types - # Be consistent with versions before 1.7, "validate" actually modifies the - # booster. + if self.feature_names is None: self.feature_names = fn if self.feature_types is None: From 2caceb157dfbd35d11170d1bce68cb0bf56543e7 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 17 Jul 2023 13:25:46 +0800 Subject: [PATCH 036/136] [jvm-packages] Reduce log verbosity for GPU tests. 
(#9389)

---
 .../ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala
index 2a355e160..112f7db12 100644
--- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala
+++ b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala
@@ -282,7 +282,7 @@ object SparkSessionHolder extends Logging {
     logDebug(s"SETTING CONF: ${conf.getAll.toMap}")
     setAllConfs(conf.getAll)
     logDebug(s"RUN WITH CONF: ${spark.conf.getAll}\n")
+    spark.sparkContext.setLogLevel("WARN")
     f(spark)
   }
-
 }

From f4fb2be101034f4a43ce9e79cc0e1375906d23e7 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Mon, 17 Jul 2023 18:40:39 +0800
Subject: [PATCH 037/136] [jvm-packages] Add the new `device` parameter. (#9385)

---
 doc/jvm/xgboost4j_spark_gpu_tutorial.rst      | 15 +++----
 .../example/spark/SparkMLlibPipeline.scala    | 15 ++++---
 .../scala/example/spark/SparkTraining.scala   | 12 +++---
 .../example/spark/SparkExamplesTest.scala     |  4 +-
 .../dmlc/xgboost4j/gpu/java/BoosterTest.java  |  3 +-
 .../scala/rapids/spark/GpuPreXGBoost.scala    |  8 +++-
 .../rapids/spark/GpuXGBoostGeneralSuite.scala | 43 +++++++++++++++----
 .../spark/GpuXGBoostRegressorSuite.scala      | 32 +++++++++++---
 .../dmlc/xgboost4j/scala/spark/XGBoost.scala  | 11 +++--
 .../scala/spark/XGBoostClassifier.scala       |  2 +
 .../scala/spark/XGBoostRegressor.scala        |  2 +
 .../scala/spark/params/BoosterParams.scala    |  8 ++++
 .../scala/spark/params/GeneralParams.scala    |  2 +-
 .../scala/spark/XGBoostClassifierSuite.scala  |  1 -
 .../dmlc/xgboost4j/java/BoosterImplTest.java  |  1 -
 15 files changed, 112 insertions(+), 47 deletions(-)

diff --git a/doc/jvm/xgboost4j_spark_gpu_tutorial.rst b/doc/jvm/xgboost4j_spark_gpu_tutorial.rst
index f3b97d9c3..7b80286ef 100644
--- a/doc/jvm/xgboost4j_spark_gpu_tutorial.rst
+++ b/doc/jvm/xgboost4j_spark_gpu_tutorial.rst
@@ -121,7 +121,7 @@ To train a XGBoost model for classification, we need to claim a XGBoostClassifie
       "objective" -> "multi:softprob",
       "num_class" -> 3,
       "num_round" -> 100,
-      "tree_method" -> "gpu_hist",
+      "device" -> "cuda",
       "num_workers" -> 1)
   val featuresNames = schema.fieldNames.filter(name => name != labelName)
   val xgbClassifier = new XGBoostClassifier(xgbParam)
     .setFeaturesCol(featuresNames)
     .setLabelCol(labelName)

-The available parameters for training a XGBoost model can be found in :doc:`here </parameter>`.
-Similar to the XGBoost4J-Spark package, in addition to the default set of parameters,
-XGBoost4J-Spark-GPU also supports the camel-case variant of these parameters to be
-consistent with Spark's MLlib naming convention.
+The ``device`` parameter is for informing XGBoost that CUDA devices should be used instead of CPU. Unlike the single-node mode, GPUs are managed by Spark instead of by XGBoost. Therefore, an explicitly specified device ordinal such as ``cuda:1`` is not supported.
+
+The available parameters for training a XGBoost model can be found in :doc:`here </parameter>`. Similar to the XGBoost4J-Spark package, in addition to the default set of parameters, XGBoost4J-Spark-GPU also supports the camel-case variant of these parameters to be consistent with Spark's MLlib naming convention.
 Specifically, each parameter in :doc:`this page </parameter>` has its equivalent form in
-XGBoost4J-Spark-GPU with camel case. For example, to set ``max_depth`` for each tree, you can pass
-parameter just like what we did in the above code snippet (as ``max_depth`` wrapped in a Map), or
-you can do it through setters in XGBoostClassifer:
+XGBoost4J-Spark-GPU with camel case. For example, to set ``max_depth`` for each tree, you
+can pass the parameter just like what we did in the above code snippet (as ``max_depth``
+wrapped in a Map), or you can do it through setters in XGBoostClassifier:

 .. code-block:: scala
diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala
index b8da31c09..ae59af571 100644
--- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala
+++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkMLlibPipeline.scala
@@ -40,20 +40,20 @@ object SparkMLlibPipeline {
     val nativeModelPath = args(1)
     val pipelineModelPath = args(2)

-    val (treeMethod, numWorkers) = if (args.length == 4 && args(3) == "gpu") {
-      ("gpu_hist", 1)
-    } else ("auto", 2)
+    val (device, numWorkers) = if (args.length == 4 && args(3) == "gpu") {
+      ("cuda", 1)
+    } else ("cpu", 2)

     val spark = SparkSession
       .builder()
       .appName("XGBoost4J-Spark Pipeline Example")
       .getOrCreate()
-    run(spark, inputPath, nativeModelPath, pipelineModelPath, treeMethod, numWorkers)
+    run(spark, inputPath, nativeModelPath, pipelineModelPath, device, numWorkers)
       .show(false)
   }

   private[spark] def run(spark: SparkSession, inputPath: String, nativeModelPath: String,
-                         pipelineModelPath: String, treeMethod: String,
+                         pipelineModelPath: String, device: String,
                          numWorkers: Int): DataFrame = {
     // Load dataset
@@ -82,13 +82,14 @@ object SparkMLlibPipeline {
       .setOutputCol("classIndex")
       .fit(training)
     val booster = new XGBoostClassifier(
-      Map("eta" -> 0.1f,
+      Map(
+        "eta" -> 0.1f,
         "max_depth" -> 2,
         "objective" -> "multi:softprob",
         "num_class" -> 3,
         "num_round" -> 100,
         "num_workers" -> numWorkers,
-        "tree_method" -> treeMethod
+        "device" -> device
       )
     )
     booster.setFeaturesCol("features")
diff --git a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkTraining.scala b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkTraining.scala
index a7886f524..67a9f7e23 100644
--- a/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkTraining.scala
+++ b/jvm-packages/xgboost4j-example/src/main/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkTraining.scala
@@ -31,18 +31,18 @@ object SparkTraining {
       sys.exit(1)
     }

-    val (treeMethod, numWorkers) = if (args.length == 2 && args(1) == "gpu") {
-      ("gpu_hist", 1)
-    } else ("auto", 2)
+    val (device, numWorkers) = if (args.length == 2 && args(1) == "gpu") {
+      ("cuda", 1)
+    } else ("cpu", 2)

     val spark = SparkSession.builder().getOrCreate()
     val inputPath = args(0)
-    val results: DataFrame = run(spark, inputPath, treeMethod, numWorkers)
+    val results: DataFrame = run(spark, inputPath, device, numWorkers)
     results.show()
   }

   private[spark] def run(spark: SparkSession, inputPath: String,
-                         treeMethod: String, numWorkers: Int): DataFrame = {
+                         device: String, numWorkers: Int): DataFrame = {
     val schema = new StructType(Array(
       StructField("sepal length", DoubleType, true),
StructField("sepal width", DoubleType, true), @@ -80,7 +80,7 @@ private[spark] def run(spark: SparkSession, inputPath: String, "num_class" -> 3, "num_round" -> 100, "num_workers" -> numWorkers, - "tree_method" -> treeMethod, + "device" -> device, "eval_sets" -> Map("eval1" -> eval1, "eval2" -> eval2)) val xgbClassifier = new XGBoostClassifier(xgbParam). setFeaturesCol("features"). diff --git a/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkExamplesTest.scala b/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkExamplesTest.scala index f6cb700df..2e87bf066 100644 --- a/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkExamplesTest.scala +++ b/jvm-packages/xgboost4j-example/src/test/scala/ml/dmlc/xgboost4j/scala/example/spark/SparkExamplesTest.scala @@ -104,7 +104,7 @@ class SparkExamplesTest extends AnyFunSuite with BeforeAndAfterAll { test("Smoke test for SparkMLlibPipeline example") { SparkMLlibPipeline.run(spark, pathToTestDataset.toString, "target/native-model", - "target/pipeline-model", "auto", 2) + "target/pipeline-model", "cpu", 2) } test("Smoke test for SparkTraining example") { @@ -118,6 +118,6 @@ class SparkExamplesTest extends AnyFunSuite with BeforeAndAfterAll { .config("spark.task.cpus", 1) .getOrCreate() - SparkTraining.run(spark, pathToTestDataset.toString, "auto", 2) + SparkTraining.run(spark, pathToTestDataset.toString, "cpu", 2) } } diff --git a/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java b/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java index 24a1491e1..ce830ef99 100644 --- a/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java +++ b/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java @@ -77,7 +77,8 @@ public class BoosterTest { put("objective", "binary:logistic"); put("num_round", round); put("num_workers", 1); - put("tree_method", "gpu_hist"); + put("tree_method", "hist"); + put("device", "cuda"); put("max_bin", maxBin); } }; diff --git a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala index 9ff42e370..d34802805 100644 --- a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala +++ b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala @@ -137,8 +137,12 @@ object GpuPreXGBoost extends PreXGBoostProvider { val (Seq(labelName, weightName, marginName), feturesCols, groupName, evalSets) = estimator match { case est: XGBoostEstimatorCommon => - require(est.isDefined(est.treeMethod) && est.getTreeMethod.equals("gpu_hist"), - s"GPU train requires tree_method set to gpu_hist") + require( + est.isDefined(est.device) && + (est.getDevice.equals("cuda") || est.getDevice.equals("gpu")) || + est.isDefined(est.treeMethod) && est.getTreeMethod.equals("gpu_hist"), + s"GPU train requires `device` set to `cuda` or `gpu`." 
+ ) val groupName = estimator match { case regressor: XGBoostRegressor => if (regressor.isDefined(regressor.groupCol)) { regressor.getGroupCol } else "" diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostGeneralSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostGeneralSuite.scala index 3d643761a..c731afb1d 100644 --- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostGeneralSuite.scala +++ b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostGeneralSuite.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2021-2022 by Contributors + Copyright (c) 2021-2023 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -50,9 +50,12 @@ class GpuXGBoostGeneralSuite extends GpuTestSuite { withGpuSparkSession() { spark => import spark.implicits._ val trainingDf = trainingData.toDF(allColumnNames: _*) - val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob", - "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist", - "features_cols" -> featureNames, "label_col" -> labelName) + val xgbParam = Map( + "eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob", + "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, + "tree_method" -> "hist", "device" -> "cuda", + "features_cols" -> featureNames, "label_col" -> labelName + ) new XGBoostClassifier(xgbParam) .fit(trainingDf) } @@ -65,8 +68,11 @@ class GpuXGBoostGeneralSuite extends GpuTestSuite { trainingDf = trainingDf.select(labelName, "f2", weightName, "f3", baseMarginName, "f1") - val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob", - "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, "tree_method" -> "gpu_hist") + val xgbParam = Map( + "eta" -> 0.1f, "max_depth" -> 2, "objective" -> "multi:softprob", + "num_class" -> 3, "num_round" -> 5, "num_workers" -> 1, + "tree_method" -> "hist", "device" -> "cuda" + ) new XGBoostClassifier(xgbParam) .setFeaturesCol(featureNames) .setLabelCol(labelName) @@ -127,7 +133,7 @@ class GpuXGBoostGeneralSuite extends GpuTestSuite { } } - test("Throw exception when tree method is not set to gpu_hist") { + test("Throw exception when device is not set to cuda") { withGpuSparkSession() { spark => import spark.implicits._ val trainingDf = trainingData.toDF(allColumnNames: _*) @@ -139,12 +145,11 @@ class GpuXGBoostGeneralSuite extends GpuTestSuite { .setLabelCol(labelName) .fit(trainingDf) } - assert(thrown.getMessage.contains("GPU train requires tree_method set to gpu_hist")) + assert(thrown.getMessage.contains("GPU train requires `device` set to `cuda`")) } } test("Train with eval") { - withGpuSparkSession() { spark => import spark.implicits._ val Array(trainingDf, eval1, eval2) = trainingData.toDF(allColumnNames: _*) @@ -184,4 +189,24 @@ class GpuXGBoostGeneralSuite extends GpuTestSuite { } } + test("device ordinal should not be specified") { + withGpuSparkSession() { spark => + import spark.implicits._ + val trainingDf = trainingData.toDF(allColumnNames: _*) + val params = Map( + "objective" -> "multi:softprob", + "num_class" -> 3, + "num_round" -> 5, + "num_workers" -> 1 + ) + val thrown = intercept[IllegalArgumentException] { + new XGBoostClassifier(params) + .setFeaturesCol(featureNames) + .setLabelCol(labelName) + .setDevice("cuda:1") + 
.fit(trainingDf) + } + assert(thrown.getMessage.contains("`cuda` or `gpu`")) + } + } } diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostRegressorSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostRegressorSuite.scala index b8dca5d70..6c58ae9fc 100644 --- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostRegressorSuite.scala +++ b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuXGBoostRegressorSuite.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2021-2022 by Contributors + Copyright (c) 2021-2023 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -40,7 +40,7 @@ class GpuXGBoostRegressorSuite extends GpuTestSuite { test("The transform result should be same for several runs on same model") { withGpuSparkSession(enableCsvConf()) { spark => val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "reg:squarederror", - "num_round" -> 10, "num_workers" -> 1, "tree_method" -> "gpu_hist", + "num_round" -> 10, "num_workers" -> 1, "tree_method" -> "hist", "device" -> "cuda", "features_cols" -> featureNames, "label_col" -> labelName) val Array(originalDf, testDf) = spark.read.option("header", "true").schema(schema) .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1) @@ -54,10 +54,30 @@ class GpuXGBoostRegressorSuite extends GpuTestSuite { } } + test("Tree method gpu_hist still works") { + withGpuSparkSession(enableCsvConf()) { spark => + val params = Map( + "tree_method" -> "gpu_hist", + "features_cols" -> featureNames, + "label_col" -> labelName, + "num_round" -> 10, + "num_workers" -> 1 + ) + val Array(originalDf, testDf) = spark.read.option("header", "true").schema(schema) + .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1) + // Get a model + val model = new XGBoostRegressor(params).fit(originalDf) + val left = model.transform(testDf).collect() + val right = model.transform(testDf).collect() + // The left should be same with right + assert(compareResults(true, 0.000001, left, right)) + } + } + test("use weight") { withGpuSparkSession(enableCsvConf()) { spark => val xgbParam = Map("eta" -> 0.1f, "max_depth" -> 2, "objective" -> "reg:squarederror", - "num_round" -> 10, "num_workers" -> 1, "tree_method" -> "gpu_hist", + "num_round" -> 10, "num_workers" -> 1, "tree_method" -> "hist", "device" -> "cuda", "features_cols" -> featureNames, "label_col" -> labelName) val Array(originalDf, testDf) = spark.read.option("header", "true").schema(schema) .csv(getResourcePath("/rank.train.csv")).randomSplit(Array(0.7, 0.3), seed = 1) @@ -88,7 +108,8 @@ class GpuXGBoostRegressorSuite extends GpuTestSuite { val classifier = new XGBoostRegressor(xgbParam) .setFeaturesCol(featureNames) .setLabelCol(labelName) - .setTreeMethod("gpu_hist") + .setTreeMethod("hist") + .setDevice("cuda") (classifier.fit(rawInput), testDf) } @@ -175,7 +196,7 @@ class GpuXGBoostRegressorSuite extends GpuTestSuite { val classifier = new XGBoostRegressor(xgbParam) .setFeaturesCol(featureNames) .setLabelCol(labelName) - .setTreeMethod("gpu_hist") + .setDevice("cuda") classifier.fit(rawInput) } @@ -234,5 +255,4 @@ class GpuXGBoostRegressorSuite extends GpuTestSuite { assert(testDf.count() === ret.length) } } - } diff --git 
a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala index 48b31a99f..5fc16ec09 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala @@ -73,7 +73,7 @@ private[scala] case class XGBoostExecutionParams( xgbInputParams: XGBoostExecutionInputParams, earlyStoppingParams: XGBoostExecutionEarlyStoppingParams, cacheTrainingSet: Boolean, - treeMethod: Option[String], + device: Option[String], isLocal: Boolean, featureNames: Option[Array[String]], featureTypes: Option[Array[String]]) { @@ -180,6 +180,10 @@ private[this] class XGBoostExecutionParamsFactory(rawParams: Map[String, Any], s " as 'hist', 'approx', 'gpu_hist', and 'auto'") treeMethod = Some(overridedParams("tree_method").asInstanceOf[String]) } + val device: Option[String] = overridedParams.get("device") match { + case None => None + case Some(dev: String) => if (treeMethod == "gpu_hist") Some("cuda") else Some(dev) + } if (overridedParams.contains("train_test_ratio")) { logger.warn("train_test_ratio is deprecated since XGBoost 0.82, we recommend to explicitly" + " pass a training and multiple evaluation datasets by passing 'eval_sets' and " + @@ -228,7 +232,7 @@ private[this] class XGBoostExecutionParamsFactory(rawParams: Map[String, Any], s inputParams, xgbExecEarlyStoppingParams, cacheTrainingSet, - treeMethod, + device, isLocal, featureNames, featureTypes @@ -318,7 +322,7 @@ object XGBoost extends Serializable { val externalCheckpointParams = xgbExecutionParam.checkpointParam var params = xgbExecutionParam.toMap - if (xgbExecutionParam.treeMethod.exists(m => m == "gpu_hist")) { + if (xgbExecutionParam.device.exists(m => (m == "cuda" || m == "gpu"))) { val gpuId = if (xgbExecutionParam.isLocal) { // For local mode, force gpu id to primary device 0 @@ -328,6 +332,7 @@ object XGBoost extends Serializable { logger.info("Leveraging gpu device " + gpuId + " to train") params = params + ("device" -> s"cuda:$gpuId") } + val booster = if (makeCheckpoint) { SXGBoost.trainAndSaveCheckpoint( watches.toMap("train"), params, numRounds, diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala index fd4633a0d..ec8766e40 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifier.scala @@ -93,6 +93,8 @@ class XGBoostClassifier ( def setTreeMethod(value: String): this.type = set(treeMethod, value) + def setDevice(value: String): this.type = set(device, value) + def setGrowPolicy(value: String): this.type = set(growPolicy, value) def setMaxBins(value: Int): this.type = set(maxBins, value) diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala index 99dbdc580..986e04c6b 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressor.scala @@ -95,6 +95,8 @@ class XGBoostRegressor ( def 
setTreeMethod(value: String): this.type = set(treeMethod, value) + def setDevice(value: String): this.type = set(device, value) + def setGrowPolicy(value: String): this.type = set(growPolicy, value) def setMaxBins(value: Int): this.type = set(maxBins, value) diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala index 21a77341c..61efc2865 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/BoosterParams.scala @@ -154,6 +154,14 @@ private[spark] trait BoosterParams extends Params { (value: String) => BoosterParams.supportedTreeMethods.contains(value)) final def getTreeMethod: String = $(treeMethod) + /** + * The device for running XGBoost algorithms, options: cpu, cuda + */ + final val device = new Param[String]( + this, "device", "The device for running XGBoost algorithms, options: cpu, cuda" + ) + + final def getDevice: String = $(device) /** * growth policy for fast histogram algorithm diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/GeneralParams.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/GeneralParams.scala index 3f387de9b..b85f4dc8b 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/GeneralParams.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/GeneralParams.scala @@ -284,7 +284,7 @@ private[spark] trait ParamMapFuncs extends Params { (paramName == "updater" && paramValue != "grow_histmaker,prune" && paramValue != "grow_quantile_histmaker" && paramValue != "grow_gpu_hist")) { throw new IllegalArgumentException(s"you specified $paramName as $paramValue," + - s" XGBoost-Spark only supports gbtree as booster type and grow_histmaker,prune or" + + s" XGBoost-Spark only supports gbtree as booster type and grow_histmaker or" + s" grow_quantile_histmaker or grow_gpu_hist as the updater type") } val name = CaseFormat.LOWER_UNDERSCORE.to(CaseFormat.LOWER_CAMEL, paramName) diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala index 1290465ea..9b53c7642 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostClassifierSuite.scala @@ -469,7 +469,6 @@ class XGBoostClassifierSuite extends AnyFunSuite with PerTest with TmpFolderPerS .setFeatureTypes(featureTypes) val model = xgb.fit(trainingDF) val modelStr = new String(model._booster.toByteArray("json")) - System.out.println(modelStr) val jsonModel = parseJson(modelStr) implicit val formats: Formats = DefaultFormats val featureNamesInModel = (jsonModel \ "learner" \ "feature_names").extract[List[String]] diff --git a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java index d53c003a4..70966a38f 100644 --- a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java +++ 
b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java @@ -143,7 +143,6 @@ public class BoosterImplTest { booster.saveModel(temp.getAbsolutePath()); String modelString = new String(booster.toByteArray("json")); - System.out.println(modelString); Booster bst2 = XGBoost.loadModel(temp.getAbsolutePath()); assert (Arrays.equals(bst2.toByteArray("ubj"), booster.toByteArray("ubj"))); From 2a0ff209ff4ffd1ccadb00674d6c75a7ff223b74 Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Mon, 17 Jul 2023 10:53:57 -0700 Subject: [PATCH 038/136] [CI] Block CI from running for dependabot PRs (#9394) --- tests/buildkite/pipeline-mgpu.yml | 2 +- tests/buildkite/pipeline-win64.yml | 2 +- tests/buildkite/pipeline.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/buildkite/pipeline-mgpu.yml b/tests/buildkite/pipeline-mgpu.yml index aff2d078b..3229646d5 100644 --- a/tests/buildkite/pipeline-mgpu.yml +++ b/tests/buildkite/pipeline-mgpu.yml @@ -12,7 +12,7 @@ steps: queue: pipeline-loader - wait - block: ":rocket: Run this test job" - if: build.pull_request.id != null + if: build.pull_request.id != null || build.branch =~ /^dependabot\// #### -------- CONTAINER BUILD -------- - label: ":docker: Build containers" commands: diff --git a/tests/buildkite/pipeline-win64.yml b/tests/buildkite/pipeline-win64.yml index 0a1f7f164..d4491148e 100644 --- a/tests/buildkite/pipeline-win64.yml +++ b/tests/buildkite/pipeline-win64.yml @@ -6,7 +6,7 @@ steps: queue: pipeline-loader - wait - block: ":rocket: Run this test job" - if: build.pull_request.id != null + if: build.pull_request.id != null || build.branch =~ /^dependabot\// #### -------- BUILD -------- - label: ":windows: Build XGBoost for Windows with CUDA" command: "tests/buildkite/build-win64-gpu.ps1" diff --git a/tests/buildkite/pipeline.yml b/tests/buildkite/pipeline.yml index fa09242bf..72e1ec1e5 100644 --- a/tests/buildkite/pipeline.yml +++ b/tests/buildkite/pipeline.yml @@ -9,7 +9,7 @@ steps: queue: pipeline-loader - wait - block: ":rocket: Run this test job" - if: build.pull_request.id != null + if: build.pull_request.id != null || build.branch =~ /^dependabot\// #### -------- CONTAINER BUILD -------- - label: ":docker: Build containers" commands: From 6e18d3a290d67943dda4d43c76f78b3c5ba4719b Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 18 Jul 2023 08:47:03 +0800 Subject: [PATCH 039/136] [pyspark] Handle the `device` parameter in pyspark. (#9390) - Handle the new `device` parameter in PySpark. - Deprecate the old `use_gpu` parameter. 
--- doc/tutorials/spark_estimator.rst | 23 ++-- python-package/xgboost/core.py | 21 +++ python-package/xgboost/dask.py | 13 +- python-package/xgboost/sklearn.py | 10 +- python-package/xgboost/spark/core.py | 71 ++++++---- python-package/xgboost/spark/estimator.py | 112 +++++++++------ python-package/xgboost/spark/utils.py | 7 +- src/gbm/gbtree.cc | 4 +- .../test_gpu_with_spark/test_gpu_spark.py | 23 +++- .../test_with_spark/test_spark_local.py | 129 +++++++++--------- 10 files changed, 244 insertions(+), 169 deletions(-) diff --git a/doc/tutorials/spark_estimator.rst b/doc/tutorials/spark_estimator.rst index 545403a34..44bdd7733 100644 --- a/doc/tutorials/spark_estimator.rst +++ b/doc/tutorials/spark_estimator.rst @@ -35,13 +35,13 @@ We can create a ``SparkXGBRegressor`` estimator like: ) -The above snippet creates a spark estimator which can fit on a spark dataset, -and return a spark model that can transform a spark dataset and generate dataset -with prediction column. We can set almost all of xgboost sklearn estimator parameters -as ``SparkXGBRegressor`` parameters, but some parameter such as ``nthread`` is forbidden -in spark estimator, and some parameters are replaced with pyspark specific parameters -such as ``weight_col``, ``validation_indicator_col``, ``use_gpu``, for details please see -``SparkXGBRegressor`` doc. +The above snippet creates a spark estimator which can fit on a spark dataset, and return a +spark model that can transform a spark dataset and generate dataset with prediction +column. We can set almost all of xgboost sklearn estimator parameters as +``SparkXGBRegressor`` parameters, but some parameter such as ``nthread`` is forbidden in +spark estimator, and some parameters are replaced with pyspark specific parameters such as +``weight_col``, ``validation_indicator_col``, for details please see ``SparkXGBRegressor`` +doc. The following code snippet shows how to train a spark xgboost regressor model, first we need to prepare a training dataset as a spark dataframe contains @@ -88,7 +88,7 @@ XGBoost PySpark fully supports GPU acceleration. Users are not only able to enab efficient training but also utilize their GPUs for the whole PySpark pipeline including ETL and inference. In below sections, we will walk through an example of training on a PySpark standalone GPU cluster. To get started, first we need to install some additional -packages, then we can set the ``use_gpu`` parameter to ``True``. +packages, then we can set the ``device`` parameter to ``cuda`` or ``gpu``. Prepare the necessary packages ============================== @@ -128,7 +128,7 @@ Write your PySpark application ============================== Below snippet is a small example for training xgboost model with PySpark. Notice that we are -using a list of feature names and the additional parameter ``use_gpu``: +using a list of feature names and the additional parameter ``device``: .. 
code-block:: python @@ -148,12 +148,12 @@ using a list of feature names and the additional parameter ``device``: # get a list with feature column names feature_names = [x.name for x in train_df.schema if x.name != label_name] - # create a xgboost pyspark regressor estimator and set use_gpu=True + # create a xgboost pyspark regressor estimator and set device="cuda" regressor = SparkXGBRegressor( features_col=feature_names, label_col=label_name, num_workers=2, - use_gpu=True, + device="cuda", ) # train and return the model model = regressor.fit(train_df) @@ -163,6 +163,7 @@ using a list of feature names and the additional parameter ``use_gpu``: predict_df = model.transform(test_df) predict_df.show() +Like other distributed interfaces, the ``device`` parameter doesn't support specifying a device ordinal, as GPUs are managed by Spark instead of XGBoost (good: ``device=cuda``, bad: ``device=cuda:0``). Submit the PySpark application ============================== diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index d41976e8b..4cacd61f3 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -276,6 +276,27 @@ def _check_call(ret: int) -> None: raise XGBoostError(py_str(_LIB.XGBGetLastError())) +def _check_distributed_params(kwargs: Dict[str, Any]) -> None: + """Validate parameters in distributed environments.""" + device = kwargs.get("device", None) + if device and not isinstance(device, str): + msg = "Invalid type for the `device` parameter" + msg += _expect((str,), type(device)) + raise TypeError(msg) + + if device and device.find(":") != -1: + raise ValueError( + "Distributed training doesn't support selecting device ordinal as GPUs are" + " managed by the distributed framework. Use `device=cuda` or `device=gpu`" + " instead." + ) + + if kwargs.get("booster", None) == "gblinear": + raise NotImplementedError( + f"booster `{kwargs['booster']}` is not supported for distributed training." + ) + + def build_info() -> dict: """Build information of XGBoost. The returned value format is not stable. Also, please note that build time dependency is not the same as runtime dependency. For diff --git a/python-package/xgboost/dask.py b/python-package/xgboost/dask.py index 32dd2a4a7..271a5e458 100644 --- a/python-package/xgboost/dask.py +++ b/python-package/xgboost/dask.py @@ -70,6 +70,7 @@ from .core import ( Metric, Objective, QuantileDMatrix, + _check_distributed_params, _deprecate_positional_args, _expect, ) @@ -924,17 +925,7 @@ async def _train_async( ) -> Optional[TrainReturnT]: workers = _get_workers_from_data(dtrain, evals) _rabit_args = await _get_rabit_args(len(workers), dconfig, client) - - if params.get("booster", None) == "gblinear": - raise NotImplementedError( - f"booster `{params['booster']}` is not yet supported for dask." - ) - device = params.get("device", None) - if device and device.find(":") != -1: - raise ValueError( - "The dask interface for XGBoost doesn't support selecting specific device" - " ordinal. Use `device=cpu` or `device=cuda` instead." - ) + _check_distributed_params(params) def dispatched_train( parameters: Dict, diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index d69cb3a01..46a3ffa4a 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -1004,13 +1004,17 @@ class XGBModel(XGBModelBase): Validation metrics will help us track the performance of the model. eval_metric : str, list of str, or callable, optional + ..
deprecated:: 1.6.0 - Use `eval_metric` in :py:meth:`__init__` or :py:meth:`set_params` instead. + + Use `eval_metric` in :py:meth:`__init__` or :py:meth:`set_params` instead. early_stopping_rounds : int + .. deprecated:: 1.6.0 - Use `early_stopping_rounds` in :py:meth:`__init__` or - :py:meth:`set_params` instead. + + Use `early_stopping_rounds` in :py:meth:`__init__` or :py:meth:`set_params` + instead. verbose : If `verbose` is True and an evaluation set is used, the evaluation metric measured on the validation set is printed to stdout at each boosting stage. diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index 283999c6d..998afbf77 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -60,7 +60,7 @@ from scipy.special import expit, softmax # pylint: disable=no-name-in-module import xgboost from xgboost import XGBClassifier from xgboost.compat import is_cudf_available -from xgboost.core import Booster +from xgboost.core import Booster, _check_distributed_params from xgboost.sklearn import DEFAULT_N_ESTIMATORS, XGBModel, _can_use_qdm from xgboost.training import train as worker_train @@ -92,6 +92,7 @@ from .utils import ( get_class_name, get_logger, serialize_booster, + use_cuda, ) # Put pyspark specific params here, they won't be passed to XGBoost. @@ -108,7 +109,6 @@ _pyspark_specific_params = [ "arbitrary_params_dict", "force_repartition", "num_workers", - "use_gpu", "feature_names", "features_cols", "enable_sparse_data_optim", @@ -132,8 +132,7 @@ _pyspark_param_alias_map = { _inverse_pyspark_param_alias_map = {v: k for k, v in _pyspark_param_alias_map.items()} _unsupported_xgb_params = [ - "gpu_id", # we have "use_gpu" pyspark param instead. - "device", # we have "use_gpu" pyspark param instead. + "gpu_id", # we have "device" pyspark param instead. "enable_categorical", # Use feature_types param to specify categorical feature instead "use_label_encoder", "n_jobs", # Do not allow user to set it, will use `spark.task.cpus` value instead. @@ -198,11 +197,24 @@ class _SparkXGBParams( "The number of XGBoost workers. Each XGBoost worker corresponds to one spark task.", TypeConverters.toInt, ) + device = Param( + Params._dummy(), + "device", + ( + "The device type for XGBoost executors. Available options are `cpu`,`cuda`" + " and `gpu`. Set `device` to `cuda` or `gpu` if the executors are running " + "on GPU instances. Currently, only one GPU per task is supported." + ), + TypeConverters.toString, + ) use_gpu = Param( Params._dummy(), "use_gpu", - "A boolean variable. Set use_gpu=true if the executors " - + "are running on GPU instances. Currently, only one GPU per task is supported.", + ( + "Deprecated, use `device` instead. A boolean variable. Set use_gpu=true " + "if the executors are running on GPU instances. Currently, only one GPU per" + " task is supported." + ), TypeConverters.toBoolean, ) force_repartition = Param( @@ -336,10 +348,20 @@ class _SparkXGBParams( f"It cannot be less than 1 [Default is 1]" ) + tree_method = self.getOrDefault(self.getParam("tree_method")) + if ( + self.getOrDefault(self.use_gpu) or use_cuda(self.getOrDefault(self.device)) + ) and not _can_use_qdm(tree_method): + raise ValueError( + f"The `{tree_method}` tree method is not supported on GPU." 
+ ) + if self.getOrDefault(self.features_cols): - if not self.getOrDefault(self.use_gpu): + if not use_cuda(self.getOrDefault(self.device)) and not self.getOrDefault( + self.use_gpu + ): raise ValueError( - "features_col param with list value requires enabling use_gpu." + "features_col param with list value requires `device=cuda`." ) if self.getOrDefault("objective") is not None: @@ -392,17 +414,7 @@ class _SparkXGBParams( "`pyspark.ml.linalg.Vector` type." ) - if self.getOrDefault(self.use_gpu): - tree_method = self.getParam("tree_method") - if ( - self.getOrDefault(tree_method) is not None - and self.getOrDefault(tree_method) != "gpu_hist" - ): - raise ValueError( - f"tree_method should be 'gpu_hist' or None when use_gpu is True," - f"found {self.getOrDefault(tree_method)}." - ) - + if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu): gpu_per_task = ( _get_spark_session() .sparkContext.getConf() @@ -424,8 +436,8 @@ class _SparkXGBParams( # so it's okay for printing the below warning instead of checking the real # gpu numbers and raising the exception. get_logger(self.__class__.__name__).warning( - "You enabled use_gpu in spark local mode. Please make sure your local node " - "has at least %d GPUs", + "You enabled GPU in spark local mode. Please make sure your local " + "node has at least %d GPUs", self.getOrDefault(self.num_workers), ) else: @@ -558,6 +570,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): # they are added in `setParams`. self._setDefault( num_workers=1, + device="cpu", use_gpu=False, force_repartition=False, repartition_random_shuffle=False, @@ -566,9 +579,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): arbitrary_params_dict={}, ) - def setParams( - self, **kwargs: Dict[str, Any] - ) -> None: # pylint: disable=invalid-name + def setParams(self, **kwargs: Any) -> None: # pylint: disable=invalid-name """ Set params for the estimator. 
""" @@ -613,6 +624,8 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): ) raise ValueError(err_msg) _extra_params[k] = v + + _check_distributed_params(kwargs) _existing_extra_params = self.getOrDefault(self.arbitrary_params_dict) self._set(arbitrary_params_dict={**_existing_extra_params, **_extra_params}) @@ -709,9 +722,6 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): # TODO: support "num_parallel_tree" for random forest params["num_boost_round"] = self.getOrDefault("n_estimators") - if self.getOrDefault(self.use_gpu): - params["tree_method"] = "gpu_hist" - return params @classmethod @@ -883,8 +893,9 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): dmatrix_kwargs, ) = self._get_xgb_parameters(dataset) - use_gpu = self.getOrDefault(self.use_gpu) - + run_on_gpu = use_cuda(self.getOrDefault(self.device)) or self.getOrDefault( + self.use_gpu + ) is_local = _is_local(_get_spark_session().sparkContext) num_workers = self.getOrDefault(self.num_workers) @@ -903,7 +914,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): dev_ordinal = None use_qdm = _can_use_qdm(booster_params.get("tree_method", None)) - if use_gpu: + if run_on_gpu: dev_ordinal = ( context.partitionId() if is_local else _get_gpu_id(context) ) diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py index ba75aca7f..f11a0eda8 100644 --- a/python-package/xgboost/spark/estimator.py +++ b/python-package/xgboost/spark/estimator.py @@ -3,8 +3,8 @@ # pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name # pylint: disable=unused-argument, too-many-locals - -from typing import Any, Dict, List, Optional, Type, Union +import warnings +from typing import Any, List, Optional, Type, Union import numpy as np from pyspark import keyword_only @@ -77,27 +77,35 @@ def _set_pyspark_xgb_cls_param_attrs( set_param_attrs(name, param_obj) +def _deprecated_use_gpu() -> None: + warnings.warn( + "`use_gpu` is deprecated since 2.0.0, use `device` instead", FutureWarning + ) + + class SparkXGBRegressor(_SparkXGBEstimator): """SparkXGBRegressor is a PySpark ML estimator. It implements the XGBoost regression algorithm based on XGBoost python library, and it can be used in PySpark Pipeline - and PySpark ML meta algorithms like :py:class:`~pyspark.ml.tuning.CrossValidator`/ - :py:class:`~pyspark.ml.tuning.TrainValidationSplit`/ - :py:class:`~pyspark.ml.classification.OneVsRest` + and PySpark ML meta algorithms like + - :py:class:`~pyspark.ml.tuning.CrossValidator`/ + - :py:class:`~pyspark.ml.tuning.TrainValidationSplit`/ + - :py:class:`~pyspark.ml.classification.OneVsRest` SparkXGBRegressor automatically supports most of the parameters in :py:class:`xgboost.XGBRegressor` constructor and most of the parameters used in - :py:meth:`xgboost.XGBRegressor.fit` and :py:meth:`xgboost.XGBRegressor.predict` method. + :py:meth:`xgboost.XGBRegressor.fit` and :py:meth:`xgboost.XGBRegressor.predict` + method. - SparkXGBRegressor doesn't support setting `device` but supports another param - `use_gpu`, see doc below for more details. + To enable GPU support, set `device` to `cuda` or `gpu`. - SparkXGBRegressor doesn't support setting `base_margin` explicitly as well, but support - another param called `base_margin_col`. see doc below for more details. + SparkXGBRegressor doesn't support setting `base_margin` explicitly as well, but + support another param called `base_margin_col`. 
see doc below for more details. SparkXGBRegressor doesn't support `validate_features` and `output_margin` param. - SparkXGBRegressor doesn't support setting `nthread` xgboost param, instead, the `nthread` - param for each xgboost worker will be set equal to `spark.task.cpus` config value. + SparkXGBRegressor doesn't support setting `nthread` xgboost param, instead, the + `nthread` param for each xgboost worker will be set equal to `spark.task.cpus` + config value. Parameters @@ -133,8 +141,11 @@ class SparkXGBRegressor(_SparkXGBEstimator): How many XGBoost workers to be used to train. Each XGBoost worker corresponds to one spark task. use_gpu: - Boolean value to specify whether the executors are running on GPU - instances. + .. deprecated:: 2.0.0 + + Use `device` instead. + device: + Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`. force_repartition: Boolean value to specify if forcing the input dataset to be repartitioned before XGBoost training. @@ -193,14 +204,17 @@ class SparkXGBRegressor(_SparkXGBEstimator): weight_col: Optional[str] = None, base_margin_col: Optional[str] = None, num_workers: int = 1, - use_gpu: bool = False, + use_gpu: Optional[bool] = None, + device: Optional[str] = None, force_repartition: bool = False, repartition_random_shuffle: bool = False, enable_sparse_data_optim: bool = False, - **kwargs: Dict[str, Any], + **kwargs: Any, ) -> None: super().__init__() input_kwargs = self._input_kwargs + if use_gpu: + _deprecated_use_gpu() self.setParams(**input_kwargs) @classmethod @@ -238,27 +252,29 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction """SparkXGBClassifier is a PySpark ML estimator. It implements the XGBoost classification algorithm based on XGBoost python library, and it can be used in PySpark Pipeline and PySpark ML meta algorithms like - :py:class:`~pyspark.ml.tuning.CrossValidator`/ - :py:class:`~pyspark.ml.tuning.TrainValidationSplit`/ - :py:class:`~pyspark.ml.classification.OneVsRest` + - :py:class:`~pyspark.ml.tuning.CrossValidator`/ + - :py:class:`~pyspark.ml.tuning.TrainValidationSplit`/ + - :py:class:`~pyspark.ml.classification.OneVsRest` SparkXGBClassifier automatically supports most of the parameters in :py:class:`xgboost.XGBClassifier` constructor and most of the parameters used in - :py:meth:`xgboost.XGBClassifier.fit` and :py:meth:`xgboost.XGBClassifier.predict` method. + :py:meth:`xgboost.XGBClassifier.fit` and :py:meth:`xgboost.XGBClassifier.predict` + method. - SparkXGBClassifier doesn't support setting `device` but support another param - `use_gpu`, see doc below for more details. + To enable GPU support, set `device` to `cuda` or `gpu`. - SparkXGBClassifier doesn't support setting `base_margin` explicitly as well, but support - another param called `base_margin_col`. see doc below for more details. + SparkXGBClassifier doesn't support setting `base_margin` explicitly as well, but + support another param called `base_margin_col`. see doc below for more details. - SparkXGBClassifier doesn't support setting `output_margin`, but we can get output margin - from the raw prediction column. See `raw_prediction_col` param doc below for more details. + SparkXGBClassifier doesn't support setting `output_margin`, but we can get output + margin from the raw prediction column. See `raw_prediction_col` param doc below for + more details. SparkXGBClassifier doesn't support `validate_features` and `output_margin` param. 
- SparkXGBClassifier doesn't support setting `nthread` xgboost param, instead, the `nthread` - param for each xgboost worker will be set equal to `spark.task.cpus` config value. + SparkXGBClassifier doesn't support setting `nthread` xgboost param, instead, the + `nthread` param for each xgboost worker will be set equal to `spark.task.cpus` + config value. Parameters @@ -300,8 +316,11 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction How many XGBoost workers to be used to train. Each XGBoost worker corresponds to one spark task. use_gpu: - Boolean value to specify whether the executors are running on GPU - instances. + .. deprecated:: 2.0.0 + + Use `device` instead. + device: + Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`. force_repartition: Boolean value to specify if forcing the input dataset to be repartitioned before XGBoost training. @@ -360,11 +379,12 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction weight_col: Optional[str] = None, base_margin_col: Optional[str] = None, num_workers: int = 1, - use_gpu: bool = False, + use_gpu: Optional[bool] = None, + device: Optional[str] = None, force_repartition: bool = False, repartition_random_shuffle: bool = False, enable_sparse_data_optim: bool = False, - **kwargs: Dict[str, Any], + **kwargs: Any, ) -> None: super().__init__() # The default 'objective' param value comes from sklearn `XGBClassifier` ctor, @@ -372,6 +392,8 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction # binary or multinomial input dataset, and we need to remove the fixed default # param value as well to avoid causing ambiguity. input_kwargs = self._input_kwargs + if use_gpu: + _deprecated_use_gpu() self.setParams(**input_kwargs) self._setDefault(objective=None) @@ -422,19 +444,20 @@ class SparkXGBRanker(_SparkXGBEstimator): :py:class:`xgboost.XGBRanker` constructor and most of the parameters used in :py:meth:`xgboost.XGBRanker.fit` and :py:meth:`xgboost.XGBRanker.predict` method. - SparkXGBRanker doesn't support setting `device` but support another param `use_gpu`, - see doc below for more details. + To enable GPU support, set `device` to `cuda` or `gpu`. SparkXGBRanker doesn't support setting `base_margin` explicitly as well, but support another param called `base_margin_col`. see doc below for more details. SparkXGBRanker doesn't support setting `output_margin`, but we can get output margin - from the raw prediction column. See `raw_prediction_col` param doc below for more details. + from the raw prediction column. See `raw_prediction_col` param doc below for more + details. SparkXGBRanker doesn't support `validate_features` and `output_margin` param. - SparkXGBRanker doesn't support setting `nthread` xgboost param, instead, the `nthread` - param for each xgboost worker will be set equal to `spark.task.cpus` config value. + SparkXGBRanker doesn't support setting `nthread` xgboost param, instead, the + `nthread` param for each xgboost worker will be set equal to `spark.task.cpus` + config value. Parameters @@ -467,13 +490,15 @@ class SparkXGBRanker(_SparkXGBEstimator): :py:class:`xgboost.XGBRanker` fit method. qid_col: Query id column name. - num_workers: How many XGBoost workers to be used to train. Each XGBoost worker corresponds to one spark task. use_gpu: - Boolean value to specify whether the executors are running on GPU - instances. + .. deprecated:: 2.0.0 + + Use `device` instead. 
+ device: + Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`. force_repartition: Boolean value to specify if forcing the input dataset to be repartitioned before XGBoost training. @@ -538,14 +563,17 @@ class SparkXGBRanker(_SparkXGBEstimator): base_margin_col: Optional[str] = None, qid_col: Optional[str] = None, num_workers: int = 1, - use_gpu: bool = False, + use_gpu: Optional[bool] = None, + device: Optional[str] = None, force_repartition: bool = False, repartition_random_shuffle: bool = False, enable_sparse_data_optim: bool = False, - **kwargs: Dict[str, Any], + **kwargs: Any, ) -> None: super().__init__() input_kwargs = self._input_kwargs + if use_gpu: + _deprecated_use_gpu() self.setParams(**input_kwargs) @classmethod diff --git a/python-package/xgboost/spark/utils.py b/python-package/xgboost/spark/utils.py index 46e465dde..5f3bb19ba 100644 --- a/python-package/xgboost/spark/utils.py +++ b/python-package/xgboost/spark/utils.py @@ -7,7 +7,7 @@ import os import sys import uuid from threading import Thread -from typing import Any, Callable, Dict, Set, Type +from typing import Any, Callable, Dict, Optional, Set, Type import pyspark from pyspark import BarrierTaskContext, SparkContext, SparkFiles @@ -186,3 +186,8 @@ def deserialize_booster(model: str) -> Booster: f.write(model) booster.load_model(tmp_file_name) return booster + + +def use_cuda(device: Optional[str]) -> bool: + """Whether xgboost is using CUDA workers.""" + return device in ("cuda", "gpu") diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index e97b27665..0806c13a7 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -98,8 +98,8 @@ void MismatchedDevices(Context const* booster, Context const* data) { - Use a data structure that matches the device ordinal in the booster. - Set the device for booster before call to inplace_predict. -This warning will only be shown once, and subsequent warnings made by the current thread will be -suppressed. +This warning will only be shown once for each thread. Subsequent warnings made by the +current thread will be suppressed. 
)"; logged = true; } diff --git a/tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py b/tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py index 1f986f96e..a962f778e 100644 --- a/tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py +++ b/tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py @@ -154,7 +154,7 @@ def spark_diabetes_dataset_feature_cols(spark_session_with_gpu): def test_sparkxgb_classifier_with_gpu(spark_iris_dataset): from pyspark.ml.evaluation import MulticlassClassificationEvaluator - classifier = SparkXGBClassifier(use_gpu=True, num_workers=num_workers) + classifier = SparkXGBClassifier(device="cuda", num_workers=num_workers) train_df, test_df = spark_iris_dataset model = classifier.fit(train_df) pred_result_df = model.transform(test_df) @@ -169,7 +169,7 @@ def test_sparkxgb_classifier_feature_cols_with_gpu(spark_iris_dataset_feature_co train_df, test_df, feature_names = spark_iris_dataset_feature_cols classifier = SparkXGBClassifier( - features_col=feature_names, use_gpu=True, num_workers=num_workers + features_col=feature_names, device="cuda", num_workers=num_workers ) model = classifier.fit(train_df) @@ -185,7 +185,7 @@ def test_cv_sparkxgb_classifier_feature_cols_with_gpu(spark_iris_dataset_feature train_df, test_df, feature_names = spark_iris_dataset_feature_cols classifier = SparkXGBClassifier( - features_col=feature_names, use_gpu=True, num_workers=num_workers + features_col=feature_names, device="cuda", num_workers=num_workers ) grid = ParamGridBuilder().addGrid(classifier.max_depth, [6, 8]).build() evaluator = MulticlassClassificationEvaluator(metricName="f1") @@ -197,11 +197,24 @@ def test_cv_sparkxgb_classifier_feature_cols_with_gpu(spark_iris_dataset_feature f1 = evaluator.evaluate(pred_result_df) assert f1 >= 0.97 + clf = SparkXGBClassifier( + features_col=feature_names, use_gpu=True, num_workers=num_workers + ) + grid = ParamGridBuilder().addGrid(clf.max_depth, [6, 8]).build() + evaluator = MulticlassClassificationEvaluator(metricName="f1") + cv = CrossValidator( + estimator=clf, evaluator=evaluator, estimatorParamMaps=grid, numFolds=3 + ) + cvModel = cv.fit(train_df) + pred_result_df = cvModel.transform(test_df) + f1 = evaluator.evaluate(pred_result_df) + assert f1 >= 0.97 + def test_sparkxgb_regressor_with_gpu(spark_diabetes_dataset): from pyspark.ml.evaluation import RegressionEvaluator - regressor = SparkXGBRegressor(use_gpu=True, num_workers=num_workers) + regressor = SparkXGBRegressor(device="cuda", num_workers=num_workers) train_df, test_df = spark_diabetes_dataset model = regressor.fit(train_df) pred_result_df = model.transform(test_df) @@ -215,7 +228,7 @@ def test_sparkxgb_regressor_feature_cols_with_gpu(spark_diabetes_dataset_feature train_df, test_df, feature_names = spark_diabetes_dataset_feature_cols regressor = SparkXGBRegressor( - features_col=feature_names, use_gpu=True, num_workers=num_workers + features_col=feature_names, device="cuda", num_workers=num_workers ) model = regressor.fit(train_df) diff --git a/tests/test_distributed/test_with_spark/test_spark_local.py b/tests/test_distributed/test_with_spark/test_spark_local.py index 124f36d02..50eafb0a1 100644 --- a/tests/test_distributed/test_with_spark/test_spark_local.py +++ b/tests/test_distributed/test_with_spark/test_spark_local.py @@ -741,11 +741,6 @@ class TestPySparkLocal: with pytest.raises(ValueError, match="early_stopping_rounds"): classifier.fit(clf_data.cls_df_train) - def test_gpu_param_setting(self, clf_data: ClfData) -> None: - py_cls = 
SparkXGBClassifier(use_gpu=True) - train_params = py_cls._get_distributed_train_params(clf_data.cls_df_train) - assert train_params["tree_method"] == "gpu_hist" - def test_classifier_with_list_eval_metric(self, clf_data: ClfData) -> None: classifier = SparkXGBClassifier(eval_metric=["auc", "rmse"]) model = classifier.fit(clf_data.cls_df_train) @@ -756,6 +751,53 @@ class TestPySparkLocal: model = classifier.fit(clf_data.cls_df_train) model.transform(clf_data.cls_df_test).collect() + def test_regressor_params_basic(self) -> None: + py_reg = SparkXGBRegressor() + assert hasattr(py_reg, "n_estimators") + assert py_reg.n_estimators.parent == py_reg.uid + assert not hasattr(py_reg, "gpu_id") + assert hasattr(py_reg, "device") + assert py_reg.getOrDefault(py_reg.n_estimators) == 100 + assert py_reg.getOrDefault(getattr(py_reg, "objective")), "reg:squarederror" + py_reg2 = SparkXGBRegressor(n_estimators=200) + assert py_reg2.getOrDefault(getattr(py_reg2, "n_estimators")), 200 + py_reg3 = py_reg2.copy({getattr(py_reg2, "max_depth"): 10}) + assert py_reg3.getOrDefault(getattr(py_reg3, "n_estimators")), 200 + assert py_reg3.getOrDefault(getattr(py_reg3, "max_depth")), 10 + + def test_classifier_params_basic(self) -> None: + py_clf = SparkXGBClassifier() + assert hasattr(py_clf, "n_estimators") + assert py_clf.n_estimators.parent == py_clf.uid + assert not hasattr(py_clf, "gpu_id") + assert hasattr(py_clf, "device") + + assert py_clf.getOrDefault(py_clf.n_estimators) == 100 + assert py_clf.getOrDefault(getattr(py_clf, "objective")) is None + py_clf2 = SparkXGBClassifier(n_estimators=200) + assert py_clf2.getOrDefault(getattr(py_clf2, "n_estimators")) == 200 + py_clf3 = py_clf2.copy({getattr(py_clf2, "max_depth"): 10}) + assert py_clf3.getOrDefault(getattr(py_clf3, "n_estimators")) == 200 + assert py_clf3.getOrDefault(getattr(py_clf3, "max_depth")), 10 + + def test_classifier_kwargs_basic(self, clf_data: ClfData) -> None: + py_clf = SparkXGBClassifier(**clf_data.cls_params) + assert hasattr(py_clf, "n_estimators") + assert py_clf.n_estimators.parent == py_clf.uid + assert not hasattr(py_clf, "gpu_id") + assert hasattr(py_clf, "device") + assert hasattr(py_clf, "arbitrary_params_dict") + assert py_clf.getOrDefault(py_clf.arbitrary_params_dict) == {} + + # Testing overwritten params + py_clf = SparkXGBClassifier() + py_clf.setParams(x=1, y=2) + py_clf.setParams(y=3, z=4) + xgb_params = py_clf._gen_xgb_params_dict() + assert xgb_params["x"] == 1 + assert xgb_params["y"] == 3 + assert xgb_params["z"] == 4 + def test_regressor_model_save_load(self, reg_data: RegData) -> None: with tempfile.TemporaryDirectory() as tmpdir: path = "file:" + tmpdir @@ -826,6 +868,24 @@ class TestPySparkLocal: ) assert_model_compatible(model.stages[0], tmpdir) + def test_device_param(self, reg_data: RegData, clf_data: ClfData) -> None: + clf = SparkXGBClassifier(device="cuda", tree_method="exact") + with pytest.raises(ValueError, match="not supported on GPU"): + clf.fit(clf_data.cls_df_train) + regressor = SparkXGBRegressor(device="cuda", tree_method="exact") + with pytest.raises(ValueError, match="not supported on GPU"): + regressor.fit(reg_data.reg_df_train) + + reg = SparkXGBRegressor(device="cuda", tree_method="gpu_hist") + reg._validate_params() + reg = SparkXGBRegressor(device="cuda") + reg._validate_params() + + clf = SparkXGBClassifier(device="cuda", tree_method="gpu_hist") + clf._validate_params() + clf = SparkXGBClassifier(device="cuda") + clf._validate_params() + class XgboostLocalTest(SparkTestCase): def setUp(self): 
@@ -1020,55 +1080,6 @@ class XgboostLocalTest(SparkTestCase): assert sklearn_regressor.max_depth == 3 assert sklearn_regressor.get_params()["sketch_eps"] == 0.5 - def test_regressor_params_basic(self): - py_reg = SparkXGBRegressor() - self.assertTrue(hasattr(py_reg, "n_estimators")) - self.assertEqual(py_reg.n_estimators.parent, py_reg.uid) - self.assertFalse(hasattr(py_reg, "gpu_id")) - self.assertFalse(hasattr(py_reg, "device")) - self.assertEqual(py_reg.getOrDefault(py_reg.n_estimators), 100) - self.assertEqual(py_reg.getOrDefault(py_reg.objective), "reg:squarederror") - py_reg2 = SparkXGBRegressor(n_estimators=200) - self.assertEqual(py_reg2.getOrDefault(py_reg2.n_estimators), 200) - py_reg3 = py_reg2.copy({py_reg2.max_depth: 10}) - self.assertEqual(py_reg3.getOrDefault(py_reg3.n_estimators), 200) - self.assertEqual(py_reg3.getOrDefault(py_reg3.max_depth), 10) - - def test_classifier_params_basic(self): - py_cls = SparkXGBClassifier() - self.assertTrue(hasattr(py_cls, "n_estimators")) - self.assertEqual(py_cls.n_estimators.parent, py_cls.uid) - self.assertFalse(hasattr(py_cls, "gpu_id")) - self.assertFalse(hasattr(py_cls, "device")) - self.assertEqual(py_cls.getOrDefault(py_cls.n_estimators), 100) - self.assertEqual(py_cls.getOrDefault(py_cls.objective), None) - py_cls2 = SparkXGBClassifier(n_estimators=200) - self.assertEqual(py_cls2.getOrDefault(py_cls2.n_estimators), 200) - py_cls3 = py_cls2.copy({py_cls2.max_depth: 10}) - self.assertEqual(py_cls3.getOrDefault(py_cls3.n_estimators), 200) - self.assertEqual(py_cls3.getOrDefault(py_cls3.max_depth), 10) - - def test_classifier_kwargs_basic(self): - py_cls = SparkXGBClassifier(**self.cls_params_kwargs) - self.assertTrue(hasattr(py_cls, "n_estimators")) - self.assertEqual(py_cls.n_estimators.parent, py_cls.uid) - self.assertFalse(hasattr(py_cls, "gpu_id")) - self.assertFalse(hasattr(py_cls, "device")) - self.assertTrue(hasattr(py_cls, "arbitrary_params_dict")) - expected_kwargs = {"sketch_eps": 0.03} - self.assertEqual( - py_cls.getOrDefault(py_cls.arbitrary_params_dict), expected_kwargs - ) - - # Testing overwritten params - py_cls = SparkXGBClassifier() - py_cls.setParams(x=1, y=2) - py_cls.setParams(y=3, z=4) - xgb_params = py_cls._gen_xgb_params_dict() - assert xgb_params["x"] == 1 - assert xgb_params["y"] == 3 - assert xgb_params["z"] == 4 - def test_param_alias(self): py_cls = SparkXGBClassifier(features_col="f1", label_col="l1") self.assertEqual(py_cls.getOrDefault(py_cls.featuresCol), "f1") @@ -1200,16 +1211,6 @@ class XgboostLocalTest(SparkTestCase): classifier = SparkXGBClassifier(num_workers=0) self.assertRaises(ValueError, classifier._validate_params) - def test_use_gpu_param(self): - classifier = SparkXGBClassifier(use_gpu=True, tree_method="exact") - self.assertRaises(ValueError, classifier._validate_params) - regressor = SparkXGBRegressor(use_gpu=True, tree_method="exact") - self.assertRaises(ValueError, regressor._validate_params) - regressor = SparkXGBRegressor(use_gpu=True, tree_method="gpu_hist") - regressor = SparkXGBRegressor(use_gpu=True) - classifier = SparkXGBClassifier(use_gpu=True, tree_method="gpu_hist") - classifier = SparkXGBClassifier(use_gpu=True) - def test_feature_importances(self): reg1 = SparkXGBRegressor(**self.reg_params) model = reg1.fit(self.reg_df_train) From e082718c66ef1f2460dad7b8bf1b7405284dc279 Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Tue, 18 Jul 2023 01:52:26 -0700 Subject: [PATCH 040/136] [CI] Build pip wheel with RMM support (#9383) --- tests/buildkite/build-containers.sh | 14 
+++--- tests/buildkite/build-cuda-with-rmm.sh | 40 ++++++++++++++--- tests/buildkite/build-cuda.sh | 5 ++- tests/buildkite/pipeline.yml | 1 - tests/buildkite/test-cpp-gpu.sh | 6 +-- tests/ci_build/Dockerfile.gpu_build_centos7 | 17 ++++++- tests/ci_build/Dockerfile.rmm | 49 --------------------- tests/ci_build/prune_libnccl.sh | 2 +- 8 files changed, 64 insertions(+), 70 deletions(-) delete mode 100644 tests/ci_build/Dockerfile.rmm diff --git a/tests/buildkite/build-containers.sh b/tests/buildkite/build-containers.sh index 899976a7d..f46e6ccd0 100755 --- a/tests/buildkite/build-containers.sh +++ b/tests/buildkite/build-containers.sh @@ -20,16 +20,18 @@ case "${container}" in cpu) ;; - gpu|rmm) + gpu) BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION" - if [[ $container == "rmm" ]] - then - BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" - fi ;; - gpu_build_centos7|jvm_gpu_build) + gpu_build_centos7) + BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" + BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" + BUILD_ARGS="$BUILD_ARGS --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION" + ;; + + jvm_gpu_build) BUILD_ARGS="$BUILD_ARGS --build-arg CUDA_VERSION_ARG=$CUDA_VERSION" BUILD_ARGS="$BUILD_ARGS --build-arg NCCL_VERSION_ARG=$NCCL_VERSION" ;; diff --git a/tests/buildkite/build-cuda-with-rmm.sh b/tests/buildkite/build-cuda-with-rmm.sh index 2e0b9fe2c..46bc98028 100755 --- a/tests/buildkite/build-cuda-with-rmm.sh +++ b/tests/buildkite/build-cuda-with-rmm.sh @@ -2,9 +2,11 @@ set -euo pipefail +WHEEL_TAG=manylinux2014_x86_64 + source tests/buildkite/conftest.sh -echo "--- Build with CUDA ${CUDA_VERSION}, RMM enabled" +echo "--- Build with CUDA ${CUDA_VERSION} with RMM" if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]] then @@ -13,14 +15,40 @@ else arch_flag="" fi -command_wrapper="tests/ci_build/ci_build.sh rmm docker --build-arg "` +command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg "` `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "` - `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION --build-arg "` - `"NCCL_VERSION_ARG=$NCCL_VERSION" + `"NCCL_VERSION_ARG=$NCCL_VERSION --build-arg "` + `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION" echo "--- Build libxgboost from the source" -$command_wrapper tests/ci_build/build_via_cmake.sh --conda-env=gpu_test -DUSE_CUDA=ON \ - -DUSE_NCCL=ON -DPLUGIN_RMM=ON ${arch_flag} +$command_wrapper tests/ci_build/prune_libnccl.sh +$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH="/opt/grpc;/opt/rmm" \ + -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \ + -DPLUGIN_RMM=ON -DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \ + -DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag} +echo "--- Build binary wheel" +$command_wrapper bash -c \ + "cd python-package && rm -rf dist/* && pip wheel --no-deps -v . 
--wheel-dir dist/" +$command_wrapper python tests/ci_build/rename_whl.py python-package/dist/*.whl \ + ${BUILDKITE_COMMIT} ${WHEEL_TAG} + +echo "--- Audit binary wheel to ensure it's compliant with manylinux2014 standard" +tests/ci_build/ci_build.sh auditwheel_x86_64 docker auditwheel repair \ + --plat ${WHEEL_TAG} python-package/dist/*.whl +$command_wrapper python tests/ci_build/rename_whl.py wheelhouse/*.whl \ + ${BUILDKITE_COMMIT} ${WHEEL_TAG} +mv -v wheelhouse/*.whl python-package/dist/ +# Make sure that libgomp.so is vendored in the wheel +tests/ci_build/ci_build.sh auditwheel_x86_64 docker bash -c \ + "unzip -l python-package/dist/*.whl | grep libgomp || exit -1" + +echo "--- Upload Python wheel" +buildkite-agent artifact upload python-package/dist/*.whl +if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]] +then + aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/experimental_build_with_rmm/ \ + --acl public-read --no-progress +fi echo "-- Stash C++ test executable (testxgboost)" buildkite-agent artifact upload build/testxgboost diff --git a/tests/buildkite/build-cuda.sh b/tests/buildkite/build-cuda.sh index c180695e8..1926754b8 100755 --- a/tests/buildkite/build-cuda.sh +++ b/tests/buildkite/build-cuda.sh @@ -17,11 +17,12 @@ fi command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg "` `"CUDA_VERSION_ARG=$CUDA_VERSION --build-arg "` - `"NCCL_VERSION_ARG=$NCCL_VERSION" + `"NCCL_VERSION_ARG=$NCCL_VERSION --build-arg "` + `"RAPIDS_VERSION_ARG=$RAPIDS_VERSION" echo "--- Build libxgboost from the source" $command_wrapper tests/ci_build/prune_libnccl.sh -$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH=/opt/grpc \ +$command_wrapper tests/ci_build/build_via_cmake.sh -DCMAKE_PREFIX_PATH="/opt/grpc" \ -DUSE_CUDA=ON -DUSE_NCCL=ON -DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DPLUGIN_FEDERATED=ON \ -DUSE_NCCL_LIB_PATH=ON -DNCCL_INCLUDE_DIR=/usr/include \ -DNCCL_LIBRARY=/workspace/libnccl_static.a ${arch_flag} diff --git a/tests/buildkite/pipeline.yml b/tests/buildkite/pipeline.yml index 72e1ec1e5..905535c52 100644 --- a/tests/buildkite/pipeline.yml +++ b/tests/buildkite/pipeline.yml @@ -16,7 +16,6 @@ steps: - "tests/buildkite/build-containers.sh cpu" - "tests/buildkite/build-containers.sh gpu" - "tests/buildkite/build-containers.sh gpu_build_centos7" - - "tests/buildkite/build-containers.sh rmm" key: build-containers agents: queue: linux-amd64-cpu diff --git a/tests/buildkite/test-cpp-gpu.sh b/tests/buildkite/test-cpp-gpu.sh index 7c8f5e505..58d250308 100755 --- a/tests/buildkite/test-cpp-gpu.sh +++ b/tests/buildkite/test-cpp-gpu.sh @@ -16,8 +16,8 @@ echo "--- Run Google Tests with CUDA, using a GPU, RMM enabled" rm -rfv build/ buildkite-agent artifact download "build/testxgboost" . 
--step build-cuda-with-rmm chmod +x build/testxgboost -tests/ci_build/ci_build.sh rmm nvidia-docker \ +tests/ci_build/ci_build.sh gpu nvidia-docker \ --build-arg CUDA_VERSION_ARG=$CUDA_VERSION \ --build-arg RAPIDS_VERSION_ARG=$RAPIDS_VERSION \ - --build-arg NCCL_VERSION_ARG=$NCCL_VERSION bash -c \ - "source activate gpu_test && build/testxgboost --use-rmm-pool" + --build-arg NCCL_VERSION_ARG=$NCCL_VERSION \ + build/testxgboost --use-rmm-pool diff --git a/tests/ci_build/Dockerfile.gpu_build_centos7 b/tests/ci_build/Dockerfile.gpu_build_centos7 index bfc79c216..4f9823baa 100644 --- a/tests/ci_build/Dockerfile.gpu_build_centos7 +++ b/tests/ci_build/Dockerfile.gpu_build_centos7 @@ -2,6 +2,7 @@ ARG CUDA_VERSION_ARG FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7 ARG CUDA_VERSION_ARG ARG NCCL_VERSION_ARG +ARG RAPIDS_VERSION_ARG # Install all basic requirements RUN \ @@ -16,8 +17,8 @@ RUN \ bash conda.sh -b -p /opt/mambaforge && \ /opt/mambaforge/bin/python -m pip install awscli && \ # CMake - wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \ - bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr + wget -nv -nc https://cmake.org/files/v3.24/cmake-3.24.0-linux-x86_64.sh --no-check-certificate && \ + bash cmake-3.24.0-linux-x86_64.sh --skip-license --prefix=/usr # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) RUN \ @@ -33,9 +34,21 @@ ENV PATH=/opt/mambaforge/bin:/usr/local/ninja:$PATH ENV CC=/opt/rh/devtoolset-9/root/usr/bin/gcc ENV CXX=/opt/rh/devtoolset-9/root/usr/bin/c++ ENV CPP=/opt/rh/devtoolset-9/root/usr/bin/cpp +ENV CUDAHOSTCXX=/opt/rh/devtoolset-9/root/usr/bin/c++ ENV GOSU_VERSION 1.10 +# Install RMM +RUN git clone -b v${RAPIDS_VERSION_ARG}.00 https://github.com/rapidsai/rmm.git --recurse-submodules --depth 1 && \ + pushd rmm && \ + mkdir build && \ + pushd build && \ + cmake .. -GNinja -DCMAKE_INSTALL_PREFIX=/opt/rmm -DCUDA_STATIC_RUNTIME=ON && \ + cmake --build . 
--target install && \ + popd && \ + popd && \ + rm -rf rmm + # Install gRPC RUN git clone -b v1.49.1 https://github.com/grpc/grpc.git \ --recurse-submodules --depth 1 && \ diff --git a/tests/ci_build/Dockerfile.rmm b/tests/ci_build/Dockerfile.rmm deleted file mode 100644 index 16db377c2..000000000 --- a/tests/ci_build/Dockerfile.rmm +++ /dev/null @@ -1,49 +0,0 @@ -ARG CUDA_VERSION_ARG -FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-ubuntu20.04 -ARG CUDA_VERSION_ARG -ARG RAPIDS_VERSION_ARG -ARG NCCL_VERSION_ARG - -# Environment -ENV DEBIAN_FRONTEND noninteractive -SHELL ["/bin/bash", "-c"] # Use Bash as shell - -# Install all basic requirements -RUN \ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub && \ - apt-get update && \ - apt-get install -y wget unzip bzip2 libgomp1 build-essential ninja-build git && \ - # Python - wget -nv -O conda.sh https://github.com/conda-forge/miniforge/releases/download/22.11.1-2/Mambaforge-22.11.1-2-Linux-x86_64.sh && \ - bash conda.sh -b -p /opt/mambaforge - -# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) -RUN \ - export CUDA_SHORT=`echo $CUDA_VERSION_ARG | grep -o -E '[0-9]+\.[0-9]'` && \ - export NCCL_VERSION=$NCCL_VERSION_ARG && \ - apt-get update && \ - apt-get install -y --allow-downgrades --allow-change-held-packages libnccl2=${NCCL_VERSION}+cuda${CUDA_SHORT} libnccl-dev=${NCCL_VERSION}+cuda${CUDA_SHORT} - -ENV PATH=/opt/mambaforge/bin:$PATH - -# Create new Conda environment with RMM -RUN \ - conda install -c conda-forge mamba && \ - mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \ - python=3.10 rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG cmake && \ - mamba clean --all - -ENV GOSU_VERSION 1.10 - -# Install lightweight sudo (not bound to TTY) -RUN set -ex; \ - wget -nv -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -# Default entry-point to use if running locally -# It will preserve attributes of created files -COPY entrypoint.sh /scripts/ - -WORKDIR /workspace -ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/prune_libnccl.sh b/tests/ci_build/prune_libnccl.sh index 5b6e48ad5..a81d6e4ac 100755 --- a/tests/ci_build/prune_libnccl.sh +++ b/tests/ci_build/prune_libnccl.sh @@ -26,7 +26,7 @@ set_property(TARGET test PROPERTY CUDA_ARCHITECTURES \${CMAKE_CUDA_ARCHITECTURES set(CMAKE_EXPORT_COMPILE_COMMANDS ON) EOF -cmake . -GNinja +cmake . -GNinja -DCMAKE_EXPORT_COMPILE_COMMANDS=ON gen_code=$(grep -o -- '--generate-code=\S*' compile_commands.json | paste -sd ' ') nvprune ${gen_code} /usr/lib64/libnccl_static.a -o ../libnccl_static.a From 0897477af0cf3bf0d04dac069712bb8a7750c1f3 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 18 Jul 2023 18:23:43 +0800 Subject: [PATCH 041/136] Remove unmaintained jvm readme and dev scripts. 
(#9395) --- demo/README.md | 2 +- jvm-packages/README.md | 160 ++---------------------------- jvm-packages/dev/.gitattributes | 3 - jvm-packages/dev/.gitignore | 1 - jvm-packages/dev/Dockerfile | 58 ----------- jvm-packages/dev/build-linux.cmd | 44 -------- jvm-packages/dev/build-linux.sh | 41 -------- jvm-packages/dev/package-linux.sh | 36 ------- 8 files changed, 8 insertions(+), 337 deletions(-) delete mode 100644 jvm-packages/dev/.gitattributes delete mode 100644 jvm-packages/dev/.gitignore delete mode 100644 jvm-packages/dev/Dockerfile delete mode 100644 jvm-packages/dev/build-linux.cmd delete mode 100755 jvm-packages/dev/build-linux.sh delete mode 100755 jvm-packages/dev/package-linux.sh diff --git a/demo/README.md b/demo/README.md index 26deb453b..df53b05bb 100644 --- a/demo/README.md +++ b/demo/README.md @@ -145,7 +145,7 @@ Send a PR to add a one sentence description:) ## Tools using XGBoost - [BayesBoost](https://github.com/mpearmain/BayesBoost) - Bayesian Optimization using xgboost and sklearn API -- [FLAML](https://github.com/microsoft/FLAML) - An open source AutoML library +- [FLAML](https://github.com/microsoft/FLAML) - An open source AutoML library designed to automatically produce accurate machine learning models with low computational cost. FLAML includes [XGBoost as one of the default learners](https://github.com/microsoft/FLAML/blob/main/flaml/model.py) and can also be used as a fast hyperparameter tuning tool for XGBoost ([code example](https://microsoft.github.io/FLAML/docs/Examples/AutoML-for-XGBoost)). - [gp_xgboost_gridsearch](https://github.com/vatsan/gp_xgboost_gridsearch) - In-database parallel grid-search for XGBoost on [Greenplum](https://github.com/greenplum-db/gpdb) using PL/Python - [tpot](https://github.com/rhiever/tpot) - A Python tool that automatically creates and optimizes machine learning pipelines using genetic programming. diff --git a/jvm-packages/README.md b/jvm-packages/README.md index 451a0d981..78f9a5e0f 100644 --- a/jvm-packages/README.md +++ b/jvm-packages/README.md @@ -3,161 +3,15 @@ [![Documentation Status](https://readthedocs.org/projects/xgboost/badge/?version=latest)](https://xgboost.readthedocs.org/en/latest/jvm/index.html) [![GitHub license](http://dmlc.github.io/img/apache2.svg)](../LICENSE) -[Documentation](https://xgboost.readthedocs.org/en/latest/jvm/index.html) | +[Documentation](https://xgboost.readthedocs.org/en/stable/jvm/index.html) | [Resources](../demo/README.md) | [Release Notes](../NEWS.md) -XGBoost4J is the JVM package of xgboost. It brings all the optimizations -and power xgboost into JVM ecosystem. +XGBoost4J is the JVM package of xgboost. It brings all the optimizations and power xgboost +into JVM ecosystem. -- Train XGBoost models in scala and java with easy customizations. -- Run distributed xgboost natively on jvm frameworks such as -Apache Flink and Apache Spark. +- Train XGBoost models in scala and java with easy customization. +- Run distributed xgboost natively on jvm frameworks such as Apache Flink and Apache +Spark. -You can find more about XGBoost on [Documentation](https://xgboost.readthedocs.org/en/latest/jvm/index.html) and [Resource Page](../demo/README.md). - -## Add Maven Dependency - -XGBoost4J, XGBoost4J-Spark, etc. in maven repository is compiled with g++-4.8.5. 
- -### Access release version - -Maven - -``` - - ml.dmlc - xgboost4j_2.12 - latest_version_num - - - ml.dmlc - xgboost4j-spark_2.12 - latest_version_num - -``` -or -``` - - ml.dmlc - xgboost4j_2.13 - latest_version_num - - - ml.dmlc - xgboost4j-spark_2.13 - latest_version_num - -``` - -sbt -```sbt -libraryDependencies ++= Seq( - "ml.dmlc" %% "xgboost4j" % "latest_version_num", - "ml.dmlc" %% "xgboost4j-spark" % "latest_version_num" -) -``` - -For the latest release version number, please check [here](https://github.com/dmlc/xgboost/releases). - - -### Access SNAPSHOT version - -First add the following Maven repository hosted by the XGBoost project: - -Maven: - -```xml - - XGBoost4J Snapshot Repo - XGBoost4J Snapshot Repo - https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/snapshot/ - -``` - -sbt: - -```sbt -resolvers += "XGBoost4J Snapshot Repo" at "https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/snapshot/" -``` - -Then add XGBoost4J as a dependency: - -Maven - -``` - - ml.dmlc - xgboost4j_2.12 - latest_version_num-SNAPSHOT - - - ml.dmlc - xgboost4j-spark_2.12 - latest_version_num-SNAPSHOT - -``` -or with scala 2.13 -``` - - ml.dmlc - xgboost4j_2.13 - latest_version_num-SNAPSHOT - - - ml.dmlc - xgboost4j-spark_2.13 - latest_version_num-SNAPSHOT - -``` - -sbt -```sbt -libraryDependencies ++= Seq( - "ml.dmlc" %% "xgboost4j" % "latest_version_num-SNAPSHOT", - "ml.dmlc" %% "xgboost4j-spark" % "latest_version_num-SNAPSHOT" -) -``` - -For the latest release version number, please check [the repository listing](https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/list.html). - -### GPU algorithm -To enable the GPU algorithm (`tree_method='gpu_hist'`), use artifacts `xgboost4j-gpu_2.12` and `xgboost4j-spark-gpu_2.12` instead. -Note that scala 2.13 is not supported by the [NVIDIA/spark-rapids#1525](https://github.com/NVIDIA/spark-rapids/issues/1525) yet, so the GPU algorithm can only be used with scala 2.12. - -## Examples - -Full code examples for Scala, Java, Apache Spark, and Apache Flink can -be found in the [examples package](https://github.com/dmlc/xgboost/tree/master/jvm-packages/xgboost4j-example). - -**NOTE on LIBSVM Format**: - -There is an inconsistent issue between XGBoost4J-Spark and other language bindings of XGBoost. - -When users use Spark to load trainingset/testset in LIBSVM format with the following code snippet: - -```scala -spark.read.format("libsvm").load("trainingset_libsvm") -``` - -Spark assumes that the dataset is 1-based indexed. However, when you do prediction with other bindings of XGBoost (e.g. Python API of XGBoost), XGBoost assumes that the dataset is 0-based indexed. It creates a pitfall for the users who train model with Spark but predict with the dataset in the same format in other bindings of XGBoost. - -## Development - -You can build/package xgboost4j locally with the following steps: - -**Linux:** -1. Ensure [Docker for Linux](https://docs.docker.com/install/) is installed. -2. Clone this repo: `git clone --recursive https://github.com/dmlc/xgboost.git` -3. Run the following command: - - With Tests: `./xgboost/jvm-packages/dev/build-linux.sh` - - Skip Tests: `./xgboost/jvm-packages/dev/build-linux.sh --skip-tests` - -**Windows:** -1. Ensure [Docker for Windows](https://docs.docker.com/docker-for-windows/install/) is installed. -2. Clone this repo: `git clone --recursive https://github.com/dmlc/xgboost.git` -3. 
Run the following command: - - With Tests: `.\xgboost\jvm-packages\dev\build-linux.cmd` - - Skip Tests: `.\xgboost\jvm-packages\dev\build-linux.cmd --skip-tests` - -*Note: this will create jars for deployment on Linux machines.* +You can find more about XGBoost on [Documentation](https://xgboost.readthedocs.org/en/stable/jvm/index.html) and [Resource Page](../demo/README.md). \ No newline at end of file diff --git a/jvm-packages/dev/.gitattributes b/jvm-packages/dev/.gitattributes deleted file mode 100644 index ed670eced..000000000 --- a/jvm-packages/dev/.gitattributes +++ /dev/null @@ -1,3 +0,0 @@ -# Set line endings to LF, even on Windows. Otherwise, execution within Docker fails. -# See https://help.github.com/articles/dealing-with-line-endings/ -*.sh text eol=lf diff --git a/jvm-packages/dev/.gitignore b/jvm-packages/dev/.gitignore deleted file mode 100644 index eb713db19..000000000 --- a/jvm-packages/dev/.gitignore +++ /dev/null @@ -1 +0,0 @@ -.m2 diff --git a/jvm-packages/dev/Dockerfile b/jvm-packages/dev/Dockerfile deleted file mode 100644 index 72ccdeba0..000000000 --- a/jvm-packages/dev/Dockerfile +++ /dev/null @@ -1,58 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -FROM centos:7 - -# Install all basic requirements -RUN \ - yum -y update && \ - yum install -y bzip2 make tar unzip wget xz git centos-release-scl yum-utils java-1.8.0-openjdk-devel && \ - yum-config-manager --enable centos-sclo-rh-testing && \ - yum -y update && \ - yum install -y devtoolset-7-gcc devtoolset-7-binutils devtoolset-7-gcc-c++ && \ - # Python - wget https://repo.continuum.io/miniconda/Miniconda3-4.5.12-Linux-x86_64.sh && \ - bash Miniconda3-4.5.12-Linux-x86_64.sh -b -p /opt/python && \ - # CMake - wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.3-Linux-x86_64.sh --no-check-certificate && \ - bash cmake-3.18.3-Linux-x86_64.sh --skip-license --prefix=/usr && \ - # Maven - wget https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \ - tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \ - ln -s /opt/apache-maven-3.6.1/ /opt/maven - -# Set the required environment variables -ENV PATH=/opt/python/bin:/opt/maven/bin:$PATH -ENV CC=/opt/rh/devtoolset-7/root/usr/bin/gcc -ENV CXX=/opt/rh/devtoolset-7/root/usr/bin/c++ -ENV CPP=/opt/rh/devtoolset-7/root/usr/bin/cpp -ENV JAVA_HOME=/usr/lib/jvm/java - -# Install Python packages -RUN \ - pip install numpy pytest scipy scikit-learn wheel kubernetes urllib3==1.22 awscli - -ENV GOSU_VERSION 1.10 - -# Install lightweight sudo (not bound to TTY) -RUN set -ex; \ - wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ - chmod +x /usr/local/bin/gosu && \ - gosu nobody true - -WORKDIR /xgboost diff --git a/jvm-packages/dev/build-linux.cmd b/jvm-packages/dev/build-linux.cmd deleted file mode 100644 index a5d962f5f..000000000 --- a/jvm-packages/dev/build-linux.cmd +++ /dev/null @@ -1,44 +0,0 @@ -@echo off - -rem -rem Licensed to the Apache Software Foundation (ASF) under one -rem or more contributor license agreements. See the NOTICE file -rem distributed with this work for additional information -rem regarding copyright ownership. The ASF licenses this file -rem to you under the Apache License, Version 2.0 (the -rem "License"); you may not use this file except in compliance -rem with the License. You may obtain a copy of the License at -rem -rem http://www.apache.org/licenses/LICENSE-2.0 -rem -rem Unless required by applicable law or agreed to in writing, -rem software distributed under the License is distributed on an -rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -rem KIND, either express or implied. See the License for the -rem specific language governing permissions and limitations -rem under the License. -rem - -rem The the local path of this file -set "BASEDIR=%~dp0" - -rem The local path of .m2 directory for maven -set "M2DIR=%BASEDIR%\.m2\" - -rem Create a local .m2 directory if needed -if not exist "%M2DIR%" mkdir "%M2DIR%" - -rem Build and tag the Dockerfile -docker build -t dmlc/xgboost4j-build %BASEDIR% - -docker run^ - -it^ - --rm^ - --memory 12g^ - --env JAVA_OPTS="-Xmx9g"^ - --env MAVEN_OPTS="-Xmx3g"^ - --ulimit core=-1^ - --volume %BASEDIR%\..\..:/xgboost^ - --volume %M2DIR%:/root/.m2^ - dmlc/xgboost4j-build^ - /xgboost/jvm-packages/dev/package-linux.sh "%*" diff --git a/jvm-packages/dev/build-linux.sh b/jvm-packages/dev/build-linux.sh deleted file mode 100755 index 1509a3752..000000000 --- a/jvm-packages/dev/build-linux.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -BASEDIR="$( cd "$( dirname "$0" )" && pwd )" # the directory of this file - -docker build -t dmlc/xgboost4j-build "${BASEDIR}" # build and tag the Dockerfile - -exec docker run \ - -it \ - --rm \ - --memory 12g \ - --env JAVA_OPTS="-Xmx9g" \ - --env MAVEN_OPTS="-Xmx3g -Dmaven.repo.local=/xgboost/jvm-packages/dev/.m2" \ - --env CI_BUILD_UID=`id -u` \ - --env CI_BUILD_GID=`id -g` \ - --env CI_BUILD_USER=`id -un` \ - --env CI_BUILD_GROUP=`id -gn` \ - --ulimit core=-1 \ - --volume "${BASEDIR}/../..":/xgboost \ - dmlc/xgboost4j-build \ - /xgboost/tests/ci_build/entrypoint.sh jvm-packages/dev/package-linux.sh "$@" - -# CI_BUILD_UID, CI_BUILD_GID, CI_BUILD_USER, CI_BUILD_GROUP -# are used by entrypoint.sh to create the user with the same uid in a container -# so all produced artifacts would be owned by your host user \ No newline at end of file diff --git a/jvm-packages/dev/package-linux.sh b/jvm-packages/dev/package-linux.sh deleted file mode 100755 index 1fd777d9b..000000000 --- a/jvm-packages/dev/package-linux.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env bash -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -cd jvm-packages - -case "$1" in - --skip-tests) SKIP_TESTS=true ;; - "") SKIP_TESTS=false ;; -esac - -if [[ -n ${SKIP_TESTS} ]]; then - if [[ ${SKIP_TESTS} == "true" ]]; then - mvn --batch-mode clean package -DskipTests - elif [[ ${SKIP_TESTS} == "false" ]]; then - mvn --batch-mode clean package - fi -else - echo "Usage: $0 [--skip-tests]" - exit 1 -fi From 7a0ccfbb491607a7b7dc2f13256db999b11d09cb Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 19 Jul 2023 13:42:38 +0800 Subject: [PATCH 042/136] Add compute 90. 
(#9397) --- cmake/Utils.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index dc523d03a..cb239f79c 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -90,8 +90,8 @@ function(format_gencode_flags flags out) endif() # Set up architecture flags if(NOT flags) - if (CUDA_VERSION VERSION_GREATER_EQUAL "11.1") - set(flags "50;60;70;80") + if (CUDA_VERSION VERSION_GREATER_EQUAL "11.8") + set(flags "50;60;70;80;90") elseif (CUDA_VERSION VERSION_GREATER_EQUAL "11.0") set(flags "50;60;70;80") elseif(CUDA_VERSION VERSION_GREATER_EQUAL "10.0") From f7f673b00c15458fb4dd74a2a0d2ba80369c5faf Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Wed, 19 Jul 2023 17:21:00 -0700 Subject: [PATCH 043/136] Switch to per-thread default stream (#9396) --- cmake/Utils.cmake | 1 + src/collective/nccl_device_communicator.cu | 27 +++++++------------ src/collective/nccl_device_communicator.cuh | 1 - src/common/device_helpers.cuh | 2 +- src/common/hist_util.cuh | 4 +-- src/tree/gpu_hist/row_partitioner.cu | 2 -- src/tree/gpu_hist/row_partitioner.cuh | 21 +++++++-------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 2 +- 8 files changed, 25 insertions(+), 35 deletions(-) diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index cb239f79c..1e0530efa 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -127,6 +127,7 @@ endfunction(format_gencode_flags flags) # Set CUDA related flags to target. Must be used after code `format_gencode_flags`. function(xgboost_set_cuda_flags target) target_compile_options(${target} PRIVATE + $<$:--default-stream per-thread> $<$:--expt-extended-lambda> $<$:--expt-relaxed-constexpr> $<$:${GEN_CODE}> diff --git a/src/collective/nccl_device_communicator.cu b/src/collective/nccl_device_communicator.cu index 470700d2d..51fa5693c 100644 --- a/src/collective/nccl_device_communicator.cu +++ b/src/collective/nccl_device_communicator.cu @@ -44,16 +44,12 @@ NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sy nccl_unique_id_ = GetUniqueId(); dh::safe_cuda(cudaSetDevice(device_ordinal_)); dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_)); - dh::safe_cuda(cudaStreamCreate(&cuda_stream_)); } NcclDeviceCommunicator::~NcclDeviceCommunicator() { if (world_size_ == 1) { return; } - if (cuda_stream_) { - dh::safe_cuda(cudaStreamDestroy(cuda_stream_)); - } if (nccl_comm_) { dh::safe_nccl(ncclCommDestroy(nccl_comm_)); } @@ -123,8 +119,8 @@ ncclRedOp_t GetNcclRedOp(Operation const &op) { template void RunBitwiseAllreduce(char *out_buffer, char const *device_buffer, Func func, int world_size, - std::size_t size, cudaStream_t stream) { - dh::LaunchN(size, stream, [=] __device__(std::size_t idx) { + std::size_t size) { + dh::LaunchN(size, [=] __device__(std::size_t idx) { auto result = device_buffer[idx]; for (auto rank = 1; rank < world_size; rank++) { result = func(result, device_buffer[rank * size + idx]); @@ -142,25 +138,22 @@ void NcclDeviceCommunicator::BitwiseAllReduce(void *send_receive_buffer, std::si // First gather data from all the workers. dh::safe_nccl(ncclAllGather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type), - nccl_comm_, cuda_stream_)); + nccl_comm_, dh::DefaultStream())); if (needs_sync_) { - dh::safe_cuda(cudaStreamSynchronize(cuda_stream_)); + dh::DefaultStream().Sync(); } // Then reduce locally. 
auto *out_buffer = static_cast(send_receive_buffer); switch (op) { case Operation::kBitwiseAND: - RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_and(), world_size_, size, - cuda_stream_); + RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_and(), world_size_, size); break; case Operation::kBitwiseOR: - RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_or(), world_size_, size, - cuda_stream_); + RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_or(), world_size_, size); break; case Operation::kBitwiseXOR: - RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_xor(), world_size_, size, - cuda_stream_); + RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_xor(), world_size_, size); break; default: LOG(FATAL) << "Not a bitwise reduce operation."; @@ -179,7 +172,7 @@ void NcclDeviceCommunicator::AllReduce(void *send_receive_buffer, std::size_t co } else { dh::safe_nccl(ncclAllReduce(send_receive_buffer, send_receive_buffer, count, GetNcclDataType(data_type), GetNcclRedOp(op), nccl_comm_, - cuda_stream_)); + dh::DefaultStream())); } allreduce_bytes_ += count * GetTypeSize(data_type); allreduce_calls_ += 1; @@ -206,7 +199,7 @@ void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_b for (int32_t i = 0; i < world_size_; ++i) { size_t as_bytes = segments->at(i); dh::safe_nccl(ncclBroadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes, - ncclChar, i, nccl_comm_, cuda_stream_)); + ncclChar, i, nccl_comm_, dh::DefaultStream())); offset += as_bytes; } dh::safe_nccl(ncclGroupEnd()); @@ -217,7 +210,7 @@ void NcclDeviceCommunicator::Synchronize() { return; } dh::safe_cuda(cudaSetDevice(device_ordinal_)); - dh::safe_cuda(cudaStreamSynchronize(cuda_stream_)); + dh::DefaultStream().Sync(); } } // namespace collective diff --git a/src/collective/nccl_device_communicator.cuh b/src/collective/nccl_device_communicator.cuh index bb3fce45c..d99002685 100644 --- a/src/collective/nccl_device_communicator.cuh +++ b/src/collective/nccl_device_communicator.cuh @@ -77,7 +77,6 @@ class NcclDeviceCommunicator : public DeviceCommunicator { int const world_size_; int const rank_; ncclComm_t nccl_comm_{}; - cudaStream_t cuda_stream_{}; ncclUniqueId nccl_unique_id_{}; size_t allreduce_bytes_{0}; // Keep statistics of the number of bytes communicated. size_t allreduce_calls_{0}; // Keep statistics of the number of reduce calls. 
diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index db38b2222..c45949f66 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -1176,7 +1176,7 @@ inline void CUDAEvent::Record(CUDAStreamView stream) { // NOLINT dh::safe_cuda(cudaEventRecord(event_, cudaStream_t{stream})); } -inline CUDAStreamView DefaultStream() { return CUDAStreamView{cudaStreamLegacy}; } +inline CUDAStreamView DefaultStream() { return CUDAStreamView{cudaStreamPerThread}; } class CUDAStream { cudaStream_t stream_; diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh index 0dcdad64d..5e5ce80ca 100644 --- a/src/common/hist_util.cuh +++ b/src/common/hist_util.cuh @@ -135,12 +135,12 @@ void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan batch_iter CHECK(!force_use_u64); auto kernel = GetColumnSizeSharedMemKernel; auto grid_size = EstimateGridSize(device, kernel, required_shared_memory); - dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory, dh::DefaultStream()}( + dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory}( kernel, batch_iter, is_valid, out_column_size); } else { auto kernel = GetColumnSizeSharedMemKernel; auto grid_size = EstimateGridSize(device, kernel, required_shared_memory); - dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory, dh::DefaultStream()}( + dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory}( kernel, batch_iter, is_valid, out_column_size); } } else { diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 015d817f3..78b04883c 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -18,12 +18,10 @@ RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) dh::safe_cuda(cudaSetDevice(device_idx_)); ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)}); thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); - dh::safe_cuda(cudaStreamCreate(&stream_)); } RowPartitioner::~RowPartitioner() { dh::safe_cuda(cudaSetDevice(device_idx_)); - dh::safe_cuda(cudaStreamDestroy(stream_)); } common::Span RowPartitioner::GetRows(bst_node_t nidx) { diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index f1c420ba0..215a0e49b 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -116,7 +116,7 @@ template void SortPositionBatch(common::Span> d_batch_info, common::Span ridx, common::Span ridx_tmp, common::Span d_counts, std::size_t total_rows, OpT op, - dh::device_vector* tmp, cudaStream_t stream) { + dh::device_vector* tmp) { dh::LDGIterator> batch_info_itr(d_batch_info.data()); WriteResultsFunctor write_results{batch_info_itr, ridx.data(), ridx_tmp.data(), d_counts.data()}; @@ -135,12 +135,12 @@ void SortPositionBatch(common::Span> d_batch_info, size_t temp_bytes = 0; if (tmp->empty()) { cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator, - IndexFlagOp(), total_rows, stream); + IndexFlagOp(), total_rows); tmp->resize(temp_bytes); } temp_bytes = tmp->size(); cub::DeviceScan::InclusiveScan(tmp->data().get(), temp_bytes, input_iterator, - discard_write_iterator, IndexFlagOp(), total_rows, stream); + discard_write_iterator, IndexFlagOp(), total_rows); constexpr int kBlockSize = 256; @@ -149,7 +149,7 @@ void SortPositionBatch(common::Span> d_batch_info, const int grid_size = xgboost::common::DivRoundUp(total_rows, kBlockSize * kItemsThread); 
SortPositionCopyKernel - <<>>(batch_info_itr, ridx, ridx_tmp, total_rows); + <<>>(batch_info_itr, ridx, ridx_tmp, total_rows); } struct NodePositionInfo { @@ -221,7 +221,6 @@ class RowPartitioner { dh::device_vector tmp_; dh::PinnedMemory pinned_; dh::PinnedMemory pinned2_; - cudaStream_t stream_; public: RowPartitioner(int device_idx, size_t num_rows); @@ -278,7 +277,7 @@ class RowPartitioner { } dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(), h_batch_info.size() * sizeof(PerNodeData), - cudaMemcpyDefault, stream_)); + cudaMemcpyDefault)); // Temporary arrays auto h_counts = pinned_.GetSpan(nidx.size(), 0); @@ -287,12 +286,12 @@ class RowPartitioner { // Partition the rows according to the operator SortPositionBatch( dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts), - total_rows, op, &tmp_, stream_); + total_rows, op, &tmp_); dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(), - cudaMemcpyDefault, stream_)); + cudaMemcpyDefault)); // TODO(Rory): this synchronisation hurts performance a lot // Future optimisation should find a way to skip this - dh::safe_cuda(cudaStreamSynchronize(stream_)); + dh::DefaultStream().Sync(); // Update segments for (size_t i = 0; i < nidx.size(); i++) { @@ -327,13 +326,13 @@ class RowPartitioner { dh::TemporaryArray d_node_info_storage(ridx_segments_.size()); dh::safe_cuda(cudaMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(), sizeof(NodePositionInfo) * ridx_segments_.size(), - cudaMemcpyDefault, stream_)); + cudaMemcpyDefault)); constexpr int kBlockSize = 512; const int kItemsThread = 8; const int grid_size = xgboost::common::DivRoundUp(ridx_.size(), kBlockSize * kItemsThread); common::Span d_ridx(ridx_.data().get(), ridx_.size()); - FinalisePositionKernel<<>>( + FinalisePositionKernel<<>>( dh::ToSpan(d_node_info_storage), d_ridx, d_out_position, op); } }; diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index f82123452..050980400 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -73,7 +73,7 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector tmp; SortPositionBatch(dh::ToSpan(d_batch_info), dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(counts), - total_rows, op, &tmp, nullptr); + total_rows, op, &tmp); auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; }; for (size_t i = 0; i < segments.size(); i++) { From dbd5309b553a94656448595c9e09d2005dce855d Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 20 Jul 2023 23:30:04 +0800 Subject: [PATCH 044/136] Fix warning message for device. (#9402) --- src/common/error_msg.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/error_msg.cc b/src/common/error_msg.cc index 593c7d6de..062549794 100644 --- a/src/common/error_msg.cc +++ b/src/common/error_msg.cc @@ -20,7 +20,7 @@ void WarnDeprecatedGPUHist() { "The tree method `gpu_hist` is deprecated since 2.0.0. To use GPU training, set the `device` " R"(parameter to CUDA instead. - E.g. tree_method = "hist", device = "CUDA" + E.g. tree_method = "hist", device = "cuda" )"; LOG(WARNING) << msg; } From 0de7c474959c47003abb941dc7a657d8e010b96b Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 22 Jul 2023 08:39:21 +0800 Subject: [PATCH 045/136] Fix metric serialization. 
(#9405) --- src/learner.cc | 8 ++++---- tests/cpp/test_learner.cc | 28 ++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/src/learner.cc b/src/learner.cc index 2f453ea30..b2d6baff0 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -583,8 +583,9 @@ class LearnerConfiguration : public Learner { auto& objective_fn = learner_parameters["objective"]; obj_->SaveConfig(&objective_fn); - std::vector metrics(metrics_.size(), Json{Object{}}); + std::vector metrics(metrics_.size()); for (size_t i = 0; i < metrics_.size(); ++i) { + metrics[i] = Object{}; metrics_[i]->SaveConfig(&metrics[i]); } learner_parameters["metrics"] = Array(std::move(metrics)); @@ -807,14 +808,13 @@ class LearnerConfiguration : public Learner { void ConfigureMetrics(Args const& args) { for (auto const& name : metric_names_) { - auto DupCheck = [&name](std::unique_ptr const& m) { - return m->Name() != name; - }; + auto DupCheck = [&name](std::unique_ptr const& m) { return m->Name() != name; }; if (std::all_of(metrics_.begin(), metrics_.end(), DupCheck)) { metrics_.emplace_back(std::unique_ptr(Metric::Create(name, &ctx_))); mparam_.contain_eval_metrics = 1; } } + for (auto& p_metric : metrics_) { p_metric->Configure(args); } diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc index 2165c6c8d..3615f7587 100644 --- a/tests/cpp/test_learner.cc +++ b/tests/cpp/test_learner.cc @@ -215,6 +215,34 @@ TEST(Learner, JsonModelIO) { } } +TEST(Learner, ConfigIO) { + bst_row_t n_samples = 128; + bst_feature_t n_features = 12; + std::shared_ptr p_fmat{ + RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true, false, 2)}; + + auto serialised_model_tmp = std::string{}; + std::string eval_res_0; + std::string eval_res_1; + { + std::unique_ptr learner{Learner::Create({p_fmat})}; + learner->SetParams(Args{{"eval_metric", "ndcg"}, {"eval_metric", "map"}}); + learner->Configure(); + learner->UpdateOneIter(0, p_fmat); + eval_res_0 = learner->EvalOneIter(0, {p_fmat}, {"Train"}); + common::MemoryBufferStream fo(&serialised_model_tmp); + learner->Save(&fo); + } + + { + common::MemoryBufferStream fi(&serialised_model_tmp); + std::unique_ptr learner{Learner::Create({p_fmat})}; + learner->Load(&fi); + eval_res_1 = learner->EvalOneIter(0, {p_fmat}, {"Train"}); + } + ASSERT_EQ(eval_res_0, eval_res_1); +} + // Crashes the test runner if there are race condiditions. // // Build with additional cmake flags to enable thread sanitizer From 22b0a55a047e5ad1cb40aed4f45fac0ce5d8aa28 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 22 Jul 2023 10:43:12 +0800 Subject: [PATCH 046/136] Remove hist builder class. (#9400) * Remove hist build class. * Cleanup this stateless class. * Add comment to thread block. 
--- src/common/hist_util.cc | 32 +++++++-------- src/common/hist_util.h | 29 +++----------- src/common/threading_utils.h | 43 ++++++++++++--------- src/tree/hist/histogram.h | 16 +++----- tests/cpp/tree/hist/test_evaluate_splits.cc | 4 +- tests/cpp/tree/hist/test_histogram.cc | 7 ++-- 6 files changed, 54 insertions(+), 77 deletions(-) diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index c9b50792d..1d950e70a 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -8,8 +8,8 @@ #include -#include "../common/common.h" -#include "column_matrix.h" +#include "../data/adapter.h" // for SparsePageAdapterBatch +#include "../data/gradient_index.h" // for GHistIndexMatrix #include "quantile.h" #include "xgboost/base.h" #include "xgboost/context.h" // Context @@ -24,9 +24,7 @@ #define PREFETCH_READ_T0(addr) do {} while (0) #endif // defined(XGBOOST_MM_PREFETCH_PRESENT) -namespace xgboost { -namespace common { - +namespace xgboost::common { HistogramCuts::HistogramCuts() { cut_ptrs_.HostVector().emplace_back(0); } @@ -350,9 +348,8 @@ void BuildHistDispatch(Span gpair, const RowSetCollection::E } template -void GHistBuilder::BuildHist(Span gpair, - const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat, - GHistRow hist, bool force_read_by_column) const { +void BuildHist(Span gpair, const RowSetCollection::Elem row_indices, + const GHistIndexMatrix &gmat, GHistRow hist, bool force_read_by_column) { /* force_read_by_column is used for testing the columnwise building of histograms. * default force_read_by_column = false */ @@ -369,14 +366,13 @@ void GHistBuilder::BuildHist(Span gpair, }); } -template void GHistBuilder::BuildHist(Span gpair, - const RowSetCollection::Elem row_indices, - const GHistIndexMatrix &gmat, GHistRow hist, - bool force_read_by_column) const; +template void BuildHist(Span gpair, + const RowSetCollection::Elem row_indices, + const GHistIndexMatrix &gmat, GHistRow hist, + bool force_read_by_column); -template void GHistBuilder::BuildHist(Span gpair, - const RowSetCollection::Elem row_indices, - const GHistIndexMatrix &gmat, GHistRow hist, - bool force_read_by_column) const; -} // namespace common -} // namespace xgboost +template void BuildHist(Span gpair, + const RowSetCollection::Elem row_indices, + const GHistIndexMatrix &gmat, GHistRow hist, + bool force_read_by_column); +} // namespace xgboost::common diff --git a/src/common/hist_util.h b/src/common/hist_util.h index 2781da8e0..c0fe5b44f 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -16,11 +16,9 @@ #include #include "categorical.h" -#include "common.h" #include "quantile.h" #include "row_set.h" #include "threading_utils.h" -#include "timer.h" #include "xgboost/base.h" // for bst_feature_t, bst_bin_t #include "xgboost/data.h" @@ -598,6 +596,8 @@ class ParallelGHistBuilder { } } + [[nodiscard]] bst_bin_t TotalBins() const { return nbins_; } + private: void MatchNodeNidPairToHist() { size_t hist_allocated_additionally = 0; @@ -643,27 +643,10 @@ class ParallelGHistBuilder { std::map, int> tid_nid_to_hist_; }; -/*! 
- * \brief builder for histograms of gradient statistics - */ -class GHistBuilder { - public: - GHistBuilder() = default; - explicit GHistBuilder(uint32_t nbins): nbins_{nbins} {} - - // construct a histogram via histogram aggregation - template - void BuildHist(Span gpair, const RowSetCollection::Elem row_indices, - const GHistIndexMatrix& gmat, GHistRow hist, - bool force_read_by_column = false) const; - uint32_t GetNumBins() const { - return nbins_; - } - - private: - /*! \brief number of all bins over all features */ - uint32_t nbins_ { 0 }; -}; +// construct a histogram via histogram aggregation +template +void BuildHist(Span gpair, const RowSetCollection::Elem row_indices, + const GHistIndexMatrix& gmat, GHistRow hist, bool force_read_by_column = false); } // namespace common } // namespace xgboost #endif // XGBOOST_COMMON_HIST_UTIL_H_ diff --git a/src/common/threading_utils.h b/src/common/threading_utils.h index d80008cc0..0247e4dcc 100644 --- a/src/common/threading_utils.h +++ b/src/common/threading_utils.h @@ -30,9 +30,7 @@ inline int32_t omp_get_thread_limit() { return std::numeric_limits::max } #endif // defined(_MSC_VER) -namespace xgboost { -namespace common { - +namespace xgboost::common { // Represent simple range of indexes [begin, end) // Inspired by tbb::blocked_range class Range1d { @@ -69,7 +67,7 @@ class Range1d { // [1,2], [3,4], [5,6], [7,8], [9] // The class helps to process data in several tree nodes (non-balanced usually) in parallel // Using nested parallelism (by nodes and by data in each node) -// it helps to improve CPU resources utilization +// it helps to improve CPU resources utilization class BlockedSpace2d { public: // Example of space: @@ -86,39 +84,47 @@ class BlockedSpace2d { // dim1 - size of the first dimension in the space // getter_size_dim2 - functor to get the second dimensions for each 'row' by row-index // grain_size - max size of produced blocks - template - BlockedSpace2d(size_t dim1, Func getter_size_dim2, size_t grain_size) { - for (size_t i = 0; i < dim1; ++i) { - const size_t size = getter_size_dim2(i); - const size_t n_blocks = size/grain_size + !!(size % grain_size); - for (size_t iblock = 0; iblock < n_blocks; ++iblock) { - const size_t begin = iblock * grain_size; - const size_t end = std::min(begin + grain_size, size); + template + BlockedSpace2d(std::size_t dim1, Func getter_size_dim2, std::size_t grain_size) { + for (std::size_t i = 0; i < dim1; ++i) { + std::size_t size = getter_size_dim2(i); + // Each row (second dim) is divided into n_blocks + std::size_t n_blocks = size / grain_size + !!(size % grain_size); + for (std::size_t iblock = 0; iblock < n_blocks; ++iblock) { + std::size_t begin = iblock * grain_size; + std::size_t end = std::min(begin + grain_size, size); AddBlock(i, begin, end); } } } // Amount of blocks(tasks) in a space - size_t Size() const { + [[nodiscard]] std::size_t Size() const { return ranges_.size(); } // get index of the first dimension of i-th block(task) - size_t GetFirstDimension(size_t i) const { + [[nodiscard]] std::size_t GetFirstDimension(size_t i) const { CHECK_LT(i, first_dimension_.size()); return first_dimension_[i]; } // get a range of indexes for the second dimension of i-th block(task) - Range1d GetRange(size_t i) const { + [[nodiscard]] Range1d GetRange(size_t i) const { CHECK_LT(i, ranges_.size()); return ranges_[i]; } private: - void AddBlock(size_t first_dimension, size_t begin, size_t end) { - first_dimension_.push_back(first_dimension); + /** + * @brief Add a parallel block. 
+ * + * @param first_dim The row index. + * @param begin The begin of the second dimension. + * @param end The end of the second dimension. + */ + void AddBlock(std::size_t first_dim, std::size_t begin, std::size_t end) { + first_dimension_.push_back(first_dim); ranges_.emplace_back(begin, end); } @@ -303,7 +309,6 @@ class MemStackAllocator { * \brief Constant that can be used for initializing static thread local memory. */ std::int32_t constexpr DefaultMaxThreads() { return 128; } -} // namespace common -} // namespace xgboost +} // namespace xgboost::common #endif // XGBOOST_COMMON_THREADING_UTILS_H_ diff --git a/src/tree/hist/histogram.h b/src/tree/hist/histogram.h index 562a0b2d4..b7f5f5da6 100644 --- a/src/tree/hist/histogram.h +++ b/src/tree/hist/histogram.h @@ -22,7 +22,6 @@ class HistogramBuilder { common::HistCollection hist_; /*! \brief culmulative local parent histogram of gradients. */ common::HistCollection hist_local_worker_; - common::GHistBuilder builder_; common::ParallelGHistBuilder buffer_; BatchParam param_; int32_t n_threads_{-1}; @@ -49,7 +48,6 @@ class HistogramBuilder { hist_.Init(total_bins); hist_local_worker_.Init(total_bins); buffer_.Init(total_bins); - builder_ = common::GHistBuilder(total_bins); is_distributed_ = is_distributed; is_col_split_ = is_col_split; // Workaround s390x gcc 7.5.0 @@ -88,8 +86,7 @@ class HistogramBuilder { elem.begin + end_of_row_set, nid); auto hist = buffer_.GetInitializedHist(tid, nid_in_set); if (rid_set.Size() != 0) { - builder_.template BuildHist(gpair_h, rid_set, gidx, hist, - force_read_by_column); + common::BuildHist(gpair_h, rid_set, gidx, hist, force_read_by_column); } }); } @@ -163,9 +160,9 @@ class HistogramBuilder { std::vector const &nodes_for_explicit_hist_build, std::vector const &nodes_for_subtraction_trick, int starting_index, int sync_count) { - const size_t nbins = builder_.GetNumBins(); + auto n_bins = buffer_.TotalBins(); common::BlockedSpace2d space( - nodes_for_explicit_hist_build.size(), [&](size_t) { return nbins; }, 1024); + nodes_for_explicit_hist_build.size(), [&](size_t) { return n_bins; }, 1024); common::ParallelFor2d(space, n_threads_, [&](size_t node, common::Range1d r) { const auto &entry = nodes_for_explicit_hist_build[node]; auto this_hist = this->hist_[entry.nid]; @@ -188,14 +185,13 @@ class HistogramBuilder { }); collective::Allreduce( - reinterpret_cast(this->hist_[starting_index].data()), - builder_.GetNumBins() * sync_count * 2); + reinterpret_cast(this->hist_[starting_index].data()), n_bins * sync_count * 2); ParallelSubtractionHist(space, nodes_for_explicit_hist_build, nodes_for_subtraction_trick, p_tree); common::BlockedSpace2d space2( - nodes_for_subtraction_trick.size(), [&](size_t) { return nbins; }, 1024); + nodes_for_subtraction_trick.size(), [&](size_t) { return n_bins; }, 1024); ParallelSubtractionHist(space2, nodes_for_subtraction_trick, nodes_for_explicit_hist_build, p_tree); } @@ -203,7 +199,7 @@ class HistogramBuilder { void SyncHistogramLocal(RegTree const *p_tree, std::vector const &nodes_for_explicit_hist_build, std::vector const &nodes_for_subtraction_trick) { - const size_t nbins = this->builder_.GetNumBins(); + const size_t nbins = this->buffer_.TotalBins(); common::BlockedSpace2d space( nodes_for_explicit_hist_build.size(), [&](size_t) { return nbins; }, 1024); diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc index 677687255..7bde3aca2 100644 --- a/tests/cpp/tree/hist/test_evaluate_splits.cc +++ 
b/tests/cpp/tree/hist/test_evaluate_splits.cc @@ -48,12 +48,10 @@ void TestEvaluateSplits(bool force_read_by_column) { std::iota(row_indices.begin(), row_indices.end(), 0); row_set_collection.Init(); - auto hist_builder = common::GHistBuilder(gmat.cut.Ptrs().back()); hist.Init(gmat.cut.Ptrs().back()); hist.AddHistRow(0); hist.AllocateAllData(); - hist_builder.template BuildHist(row_gpairs, row_set_collection[0], - gmat, hist[0], force_read_by_column); + common::BuildHist(row_gpairs, row_set_collection[0], gmat, hist[0], force_read_by_column); // Compute total gradient for all data points GradientPairPrecise total_gpair; diff --git a/tests/cpp/tree/hist/test_histogram.cc b/tests/cpp/tree/hist/test_histogram.cc index 8eb043cec..0198c6c80 100644 --- a/tests/cpp/tree/hist/test_histogram.cc +++ b/tests/cpp/tree/hist/test_histogram.cc @@ -13,8 +13,7 @@ #include "../../categorical_helpers.h" #include "../../helpers.h" -namespace xgboost { -namespace tree { +namespace xgboost::tree { namespace { void InitRowPartitionForTest(common::RowSetCollection *row_set, size_t n_samples, size_t base_rowid = 0) { auto &row_indices = *row_set->Data(); @@ -487,5 +486,5 @@ TEST(CPUHistogram, ExternalMemory) { TestHistogramExternalMemory(&ctx, {kBins, sparse_thresh}, false, false); TestHistogramExternalMemory(&ctx, {kBins, sparse_thresh}, false, true); } -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree + From 275da176ba097fa035e3deaca68400812d2a9dfa Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 22 Jul 2023 15:26:29 +0800 Subject: [PATCH 047/136] Document for device ordinal. (#9398) - Rewrite GPU demos. notebook is converted to script to avoid committing additional png plots. - Add GPU demos into the sphinx gallery. - Add RMM demos into the sphinx gallery. - Test for firing threads with different device ordinals. 
--- demo/c-api/basic/c-api-demo.c | 10 +- demo/gpu_acceleration/README.md | 5 - demo/gpu_acceleration/README.rst | 8 + demo/gpu_acceleration/cover_type.py | 56 +++--- demo/gpu_acceleration/shap.ipynb | 211 ---------------------- demo/gpu_acceleration/tree_shap.py | 55 ++++++ demo/nvflare/horizontal/custom/trainer.py | 3 +- demo/rmm_plugin/README.md | 47 ----- demo/rmm_plugin/README.rst | 51 ++++++ demo/rmm_plugin/rmm_mgpu_with_dask.py | 40 ++-- demo/rmm_plugin/rmm_singlegpu.py | 7 +- doc/.gitignore | 2 + doc/conf.py | 11 +- doc/gpu/index.rst | 14 +- doc/install.rst | 4 +- doc/parameter.rst | 46 ++--- doc/python/.gitignore | 4 +- doc/python/index.rst | 2 + doc/treemethod.rst | 4 +- doc/tutorials/categorical.rst | 4 +- doc/tutorials/external_memory.rst | 14 +- doc/tutorials/monotonic.rst | 15 +- doc/tutorials/param_tuning.rst | 4 - doc/tutorials/rf.rst | 15 +- doc/tutorials/saving_model.rst | 2 +- python-package/xgboost/sklearn.py | 8 +- python-package/xgboost/spark/estimator.py | 15 ++ src/data/data.cc | 12 +- src/gbm/gbtree.cc | 17 +- tests/ci_build/lint_python.py | 2 + tests/python-gpu/test_from_cupy.py | 2 +- tests/python-gpu/test_gpu_with_sklearn.py | 59 +++++- 32 files changed, 351 insertions(+), 398 deletions(-) delete mode 100644 demo/gpu_acceleration/README.md create mode 100644 demo/gpu_acceleration/README.rst delete mode 100644 demo/gpu_acceleration/shap.ipynb create mode 100644 demo/gpu_acceleration/tree_shap.py delete mode 100644 demo/rmm_plugin/README.md create mode 100644 demo/rmm_plugin/README.rst diff --git a/demo/c-api/basic/c-api-demo.c b/demo/c-api/basic/c-api-demo.c index 15a224e9e..e7dfa23b9 100644 --- a/demo/c-api/basic/c-api-demo.c +++ b/demo/c-api/basic/c-api-demo.c @@ -53,15 +53,7 @@ int main() { // configure the training // available parameters are described here: // https://xgboost.readthedocs.io/en/latest/parameter.html - safe_xgboost(XGBoosterSetParam(booster, "tree_method", use_gpu ? "gpu_hist" : "hist")); - if (use_gpu) { - // set the GPU to use; - // this is not necessary, but provided here as an illustration - safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "0")); - } else { - // avoid evaluating objective and metric on a GPU - safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "-1")); - } + safe_xgboost(XGBoosterSetParam(booster, "device", use_gpu ? "cuda" : "cpu")); safe_xgboost(XGBoosterSetParam(booster, "objective", "binary:logistic")); safe_xgboost(XGBoosterSetParam(booster, "min_child_weight", "1")); diff --git a/demo/gpu_acceleration/README.md b/demo/gpu_acceleration/README.md deleted file mode 100644 index a49cd0c18..000000000 --- a/demo/gpu_acceleration/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# GPU Acceleration Demo - -`cover_type.py` shows how to train a model on the [forest cover type](https://archive.ics.uci.edu/ml/datasets/covertype) dataset using GPU acceleration. The forest cover type dataset has 581,012 rows and 54 features, making it time consuming to process. We compare the run-time and accuracy of the GPU and CPU histogram algorithms. - -`shap.ipynb` demonstrates using GPU acceleration to compute SHAP values for feature importance. diff --git a/demo/gpu_acceleration/README.rst b/demo/gpu_acceleration/README.rst new file mode 100644 index 000000000..77bd221d1 --- /dev/null +++ b/demo/gpu_acceleration/README.rst @@ -0,0 +1,8 @@ +:orphan: + +GPU Acceleration Demo +===================== + +This is a collection of demonstration scripts to showcase the basic usage of GPU. Please +see :doc:`/gpu/index` for more info. 
There are other demonstrations for distributed GPU +training using dask or spark. diff --git a/demo/gpu_acceleration/cover_type.py b/demo/gpu_acceleration/cover_type.py index 1f2322d05..a582aaad3 100644 --- a/demo/gpu_acceleration/cover_type.py +++ b/demo/gpu_acceleration/cover_type.py @@ -1,41 +1,49 @@ +""" +Using xgboost on GPU devices +============================ + +Shows how to train a model on the `forest cover type +`_ dataset using GPU +acceleration. The forest cover type dataset has 581,012 rows and 54 features, making it +time consuming to process. We compare the run-time and accuracy of the GPU and CPU +histogram algorithms. + +In addition, The demo showcases using GPU with other GPU-related libraries including +cupy and cuml. These libraries are not strictly required. + +""" import time +import cupy as cp +from cuml.model_selection import train_test_split from sklearn.datasets import fetch_covtype -from sklearn.model_selection import train_test_split import xgboost as xgb # Fetch dataset using sklearn -cov = fetch_covtype() -X = cov.data -y = cov.target +X, y = fetch_covtype(return_X_y=True) +X = cp.array(X) +y = cp.array(y) +y -= y.min() # Create 0.75/0.25 train/test split -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, train_size=0.75, - random_state=42) +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.25, train_size=0.75, random_state=42 +) # Specify sufficient boosting iterations to reach a minimum num_round = 3000 # Leave most parameters as default -param = {'objective': 'multi:softmax', # Specify multiclass classification - 'num_class': 8, # Number of possible output classes - 'tree_method': 'gpu_hist' # Use GPU accelerated algorithm - } - -# Convert input data from numpy to XGBoost format -dtrain = xgb.DMatrix(X_train, label=y_train) -dtest = xgb.DMatrix(X_test, label=y_test) - -gpu_res = {} # Store accuracy result -tmp = time.time() +clf = xgb.XGBClassifier(device="cuda", n_estimators=num_round) # Train model -xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], evals_result=gpu_res) -print("GPU Training Time: %s seconds" % (str(time.time() - tmp))) +start = time.time() +clf.fit(X_train, y_train, eval_set=[(X_test, y_test)]) +gpu_res = clf.evals_result() +print("GPU Training Time: %s seconds" % (str(time.time() - start))) # Repeat for CPU algorithm -tmp = time.time() -param['tree_method'] = 'hist' -cpu_res = {} -xgb.train(param, dtrain, num_round, evals=[(dtest, 'test')], evals_result=cpu_res) -print("CPU Training Time: %s seconds" % (str(time.time() - tmp))) +clf = xgb.XGBClassifier(device="cpu", n_estimators=num_round) +start = time.time() +cpu_res = clf.evals_result() +print("CPU Training Time: %s seconds" % (str(time.time() - start))) diff --git a/demo/gpu_acceleration/shap.ipynb b/demo/gpu_acceleration/shap.ipynb deleted file mode 100644 index 7f1ee87d5..000000000 --- a/demo/gpu_acceleration/shap.ipynb +++ /dev/null @@ -1,211 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - ".. 
_california_housing_dataset:\n", - "\n", - "California Housing dataset\n", - "--------------------------\n", - "\n", - "**Data Set Characteristics:**\n", - "\n", - " :Number of Instances: 20640\n", - "\n", - " :Number of Attributes: 8 numeric, predictive attributes and the target\n", - "\n", - " :Attribute Information:\n", - " - MedInc median income in block\n", - " - HouseAge median house age in block\n", - " - AveRooms average number of rooms\n", - " - AveBedrms average number of bedrooms\n", - " - Population block population\n", - " - AveOccup average house occupancy\n", - " - Latitude house block latitude\n", - " - Longitude house block longitude\n", - "\n", - " :Missing Attribute Values: None\n", - "\n", - "This dataset was obtained from the StatLib repository.\n", - "http://lib.stat.cmu.edu/datasets/\n", - "\n", - "The target variable is the median house value for California districts.\n", - "\n", - "This dataset was derived from the 1990 U.S. census, using one row per census\n", - "block group. A block group is the smallest geographical unit for which the U.S.\n", - "Census Bureau publishes sample data (a block group typically has a population\n", - "of 600 to 3,000 people).\n", - "\n", - "It can be downloaded/loaded using the\n", - ":func:`sklearn.datasets.fetch_california_housing` function.\n", - "\n", - ".. topic:: References\n", - "\n", - " - Pace, R. Kelley and Ronald Barry, Sparse Spatial Autoregressions,\n", - " Statistics and Probability Letters, 33 (1997) 291-297\n", - "\n", - "Wall time: 28.9 s\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "import xgboost as xgb\n", - "from sklearn.datasets import fetch_california_housing\n", - "\n", - "# Fetch dataset using sklearn\n", - "data = fetch_california_housing()\n", - "print( data.DESCR)\n", - "X = data.data\n", - "y = data.target\n", - "\n", - "num_round = 500\n", - "\n", - "param = {\n", - " \"eta\": 0.05,\n", - " \"max_depth\": 10,\n", - " \"tree_method\": \"gpu_hist\",\n", - "}\n", - "\n", - "# GPU accelerated training\n", - "dtrain = xgb.DMatrix(X, label=y, feature_names=data.feature_names)\n", - "%time model = xgb.train(param, dtrain,num_round)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Wall time: 3.73 s\n" - ] - } - ], - "source": [ - "%%time\n", - "# Compute shap values using GPU with xgboost\n", - "# model.set_param({\"predictor\":\"cpu_predictor\"})\n", - "model.set_param({\"predictor\": \"gpu_predictor\"})\n", - "shap_values = model.predict(dtrain, pred_contribs=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Wall time: 49.3 s\n" - ] - } - ], - "source": [ - "%%time\n", - "# Compute shap interaction values using GPU\n", - "shap_interaction_values = model.predict(dtrain, pred_interactions=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Wall time: 3.69 s\n" - ] - }, - { - "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAABJ8AAAEACAYAAAAdhddAAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nOzdeXxU1f3/8dedmawkk7CEJSxh3wUUREBA1OJScJca6lKttfqtVq1+W6yt/rS1X6W2ltZatdZqXaOCC1VxRRTZFFwAZRGRsO9kgSyTZO7vj3MnMxOSkEAmM5O8n49HHpnlzp0z986dufc9n3OuZds2IiIiIiIiIiIikeCKdgNERERERERERKTlUvgkIiIiIiIiIiIRo/BJREREREREREQiRuGTiIiIiIiIiIhEjMInERERERERERGJGIVPIiIiIiIiIiISMQqfREREREREREQkYhQ+iYiIiIiIiIhIxCh8EhERERERERGRiFH4JCIiIiIiIiIiEaPwSURERERERERilzc3I9pNkGOj8ElERERERESkJfLmbsKbe1kd992ON/e/jZjXk3hz/9VUTWuk1/Dm/q7mjTkzty/Imbn9t9FokDSOJ9oNEBEREREREZFmVpT3f9FuQiP8GFiANxeK8u6MdmOk8RQ+iYiIiIiIiEjz8uYmUJRXEXJ9EvDBER51B97cNynKWxrJpjVGzsztCfkzsiuOPGXrpvBJREREREREpOXqgTf3feAkYBPwU4ryFuPNvQsYT1He9wDw5nYGHgMmAruAmcC/gF4U5W1y5pWEN/cxYBpwCPgdRXmPVj+TN3cCcC8wGDgA/AN4gKI82wmX3gOuAu4GsoD0kHYucm6rTRfgDeC/wLLaJsiZuX0YMAs43nnufwP35s/IrsqZuf1BIDF/Rva1zrQLgR75M7JznOszgIn5M7KnONfPB+4A+gA7gHvyZ2Q/69x3JfBb4FHgJqAQGFJHu8Wh8ElERERERESk5foxcB6wFvgT8B+gXy3TPQsUAN2BZODFWqa5GLgEuBY4H3gBb+5bFOXl480dArwJXAa87jzHPGAP8JTzeDdwNiYgCq8WMlVQe2t9Bd7cl4C5FOXdUNvdOTO3ZwDvAn935t8bE1aVA/djQq+/ONOmASOAAzkzt/fPn5G9Hvie03ZyZm6fDDzuvL5FwCjg7ZyZ27fkz8j+yHnKnkC28xqtWtssYRQ+iYiIiIiIiLRcj1KU9xWAM2D4zYedPc6b2w04DehDUV4RUIQ39/fAKTXmNZ+ivLnO5Zfx5hZggpx84H+AlyjKe825fy3e3L8DVxAMnwBuoyivsK7G5szcbgFp+TOyi0Nunk5R3s56XuMUwIepULKBNTkzt88EbsGETx8A3XNmbu8NDAI+Bb4BJufM3J4PnOxMC6aa6a/5M7IXOtc/yZm5/RnndQTCpwrgtvwZ2eX1tElCKHwSERERERERabl2hFw+5PxPrzFNV+f/5pDb8o8wr8D8AvPqBZyGN/fCkPtdwJaQ6/4a18M4wdPjzuOurL6j/uAJTLXWJid4CvjWuZ38GdlFOTO3L8dUOA3CVEltAC7FVIQV5c/IXhXyOk7Nmbn9lpB5uYGFIdd3KHhqHIVPIiIiIiIiIq3bNud/D2BjyOXGyAf+TVHe9fVMY1OUZ9d2R0jwNApThdUYW4CcnJnbrZAAqjfhQdd7BMOnq4DvgH8C64H3a7yOJ/NnZN9fz/P5G9m+Vk/hk4iIiIiIiEhrVpS3FW/uAuA+vLlXAymYQbUb4x/Ah3hz3wLeAmygP5BFUd6HDXj8RZhQCGBPzsztdU3XL39G9oYat72BGWz89pyZ2+/HVC/NwAwKHvAepkudD/gsf0a2P2fm9u8w41fdHDLdLOCJnJnblwKLMVVPxwFW/ozs5Q14HVILV7QbICIiIiIiIiJR90MgFdgKfAy85NzesO5lRXmrgamYIGcHsBt4krrPYFfTa8ArwCpM1VJWHX8baz4wf0Z2IXAGprJpF/A2ZpypB0ImW4LJQObnz8gOVC69B3id/4F5vQP8FDNW1F7ntfwFSGvg65BaWLZda8WbiIiIiIiIiLRW3twzMYFQSl1d5ZpazsztHiAPKM6fkX3VkaaX+KHwSURERERERKS18+YOx3SVW4XptvYC8DVFeT9qzmY4AVRm/ozsvc35vBJZGvNJRERERERERNoBjwFdgEJgHnBrczcif0Z2Jaa7m7QgqnwSEREREREREZGI0YDjIiIiIiIiIiISMep2JyIiYuwEOkW7EXFgF9A52o0QERFpRbSP0jq1qH0udbsTEREx9IXYcFa0GyAiItKKaB+l9Wox+1zqdiciIiIiIiIiIhGj8ElERERERERERCJG4ZOIiEgdZs2axcaNG6PdjKOyYMECXn755Wg3Q0REROLMq6++yvz58xs0bTzvKzUV7XM1jMInEREREREREYkZTz75JJ999lnMzk8aT+GTiIiIiIiIiIhEjCfaDRAREYll27ZtY968eRQXFzNw4ECmTp2Kx+OhtLSUV155ha1bt+L3++nRowdTp07F6/UC8MUXX/Dhhx9y6NAhUlNTOe200xg2bBgAn3/+OYsWLeLgwYN07dqVc845h8zMzMOe+5lnnqF///6MHj26+raHH36YSZMmMWjQIObNm8eaNWsoLy+nXbt2nHXWWeTk5Bw2n02bNvHyyy9zyy23VN82a9Yszj33XHr37o1t2yxatIgVK1ZQVlZG7969mTp1KikpKU29OEVERKQJzJo1ixNPPJGVK1eyf/9+hg4dyumnn86rr77K5s2b6datG9OmTav+Ll+3bh3vvfcexcXFdO7cmSlTppCVlQXAjh07mDt3Lvv27aNfv35YVvgJ1tavX8/8+fMpKCggKyuLqVOn0qlTpyO2saysjHnz5vHNN9+QkJDAyJEjmTBhApZlsWDBAvbv38+FF14IQEFBAbNmzeLOO+/kgw8+ID8/n61bt/LWW28xYsQIvv/973PXXXdx9tlns3TpUsrLyxkxYgSTJ08+6vmF0j5X5Cl8EhERqceqVau47LLLSExM5LnnnuOjjz7itNNOw7ZtRowYwbRp0/D7/bz22mu8+eab5Obm4vP5mDdvHtdccw0dOnSguLiY0tJSANauXcvChQuZPn067du35+OPP2bOnDlcffXVhz33cccdx/Lly6t3hPbs2UNhYSH9+vUDoGvXrpxyyikkJyezdOlSXnrpJW6++WY8nsZ9vS9btoy1a9dy1VVXkZqayrx583jjjTe4+OKLj3HpiYiISKSsWbOGyy+/HL/fz6OPPsrOnTs599xzycrK4tlnn2XZsmVMmjSJffv2MXv2bHJzc+nZsydLly7l+eef5/rrrwcgLy+PMWPGMHr0aNatW8fs2bMZP348YIKp1157jenTp5Odnc3KlSt5/vnnueGGG464vzFv3jzKysq46aabKC0t5emnnyYtLY0TTjih3sedfvrpbNmyhWHDhh027Zo1a/jpT3+Kz+fjqaeeokOHDsc0vwDtc0Weut2JiIjUY/To0W
RkZJCSksLEiRNZtWoVAKmpqQwePJiEhASSkpKYOHEimzZtqn6cZVns3r2biooK0tPT6dixIwDLly9n/PjxZGVl4XK5mDBhAjt37qSgoOCw5x44cGDYfStXrmTQoEHVOzrDhg0jNTUVl8vFuHHjqKysZO/evY1+jcuXL+e0007D6/Xi8XiYNGkSX3/9NX6/v9HzEhERkeYxevRo0tLS8Hq99OjRg65du9KlSxc8Hk/1PgTA6tWr6d+/P3369MHtdjNu3DgqKirYsmVLdQX3mDFjcLvdDB48mK5du1Y/x4oVKxg5ciTdunXD5XIxYsQI3G43W7durbdtfr+f1atX873vfY+kpCQyMzMZO3YsK1euPKbXPH78eFJSUsjIyGDMmDHV+2XHSvtckafKJxERkXoEutEBZGRkUFxcDEBFRQVvvfUWGzZsoKysDIDy8nL8fj+JiYlcfPHFLF68mLlz59K9e3fOPPNMOnToQGFhIW+99RbvvPNO9Xxt26a4uPiwrndJSUn079+f1atXM378eFavXs0555xTff/ixYv57LPPKC4uxrIsysvLKSkpafRrLCws5IUXXggrs3e5XBw8eDDs9YuIiEjsSEtLq76ckJBw2HWfzwdAcXExGRkZ1fdZlkVGRgZFRUW4XC7S09PD9gFCpy0sLOTLL7/kk08+qb6tqqqqen+oLiUlJVRVVYXNKzMzk6KioqN4pUF17ZcdK+1zRZ7CJxERkXqE7iQVFhaSnp4OmJ2Qffv2cc0115CWlsbOnTt55JFHqqft27cvffv2paKigvnz5zN37lx+/OMf4/V6mTBhQvX4T0cydOhQPvzwQ3JycqisrKRXr14A5Ofns2jRIq644go6duyIZVncd999tc4jISGBioqK6ut+v59Dhw5VX/d6vZx33nn06NGj4QtGRERE4kJ6ejq7d++uvm7bNoWFhdVhR3FxMbZtVwcihYWFtGvXDqB6v2XixImNes7U1FTcbjeFhYXVY0uFPmfNfZODBw82aL5FRUXV1eSh+2VHO79Q2ueKLHW7ExERqccnn3xCUVERpaWlLFy4kKFDhwLg8/nweDwkJydTWlrKggULqh9z8OBB1q1bVz1NYmIiLpf5yh01ahQff/xx9U5gWVkZX331VZ3P369fPwoKCvjggw8YMmRI9Y6hz+fD5XLRpk0b/H4/H374IeXl5bXOo3379lRWVrJ+/Xqqqqr46KOPqKqqqr5/1KhR1QOJAhw6dIi1a9ce/UITERGRmDFkyBDWr1/Pxo0bqaqqYsmSJXg8Hrp370737t1xuVwsW7YMv9/PmjVr2LZtW/VjR44cyfLly9m6dSu2bePz+Vi/fn2d+xwBLpeLIUOG8P7771NeXk5BQQFLliyp/vGtc+fO5OfnU1hYSFlZGQsXLgx7fFpaGgcOHDhsvosWLaK0tJTCwkKWLVtWvV92tPMLpX2uyFLlk4iISD2OO+44nn76aYqLixkwYED1L39jxoxhzpw5/PGPfyQ9PZ2xY8dW7zzYts3ixYt5+eWXsSyr+qwyAIMGDcLn8zF79mwKCwtJSkqiT58+DBkypNbn93g8DBo0iM8//5zTTz+9+vY+ffrQt29fHnzwQRISEhg7dmxYaXuo5ORkpkyZwty5c7Ftm5NPPjmstHvMmDEA1a+zTZs2DB06lIEDBx77AhQREZGo6tChAxdeeCHz5s2jqKiIzp07M336dNxuNwCXXHIJc+fOZf78+fTr149BgwZVPzY7O5tzzz2XN998k/379+PxeOjRo0etZ3qr6eyzz2bevHn89a9/xePxMHLkSI4//niA6n2fhx9+mNTUVE4++WTWrVtX/diTTjqJV199lU8//ZThw4dz9tlnA2Zspn/+85+UlZUxYsSIY55fKO1zRZZl23a02yAiIhIL9IXYcNaRJxEREZEmon0U4K677uLGG2+s7hLYSrSYfS51uxMRERERERERkYhR+CQiIiIiIiIiIhGjbnciIiKGvhAbrsWUgIuIiMQB7aO0Xi1mn0uVTyIiIiIiIiIiEjEKn0REREREREREJGIUPomIiIiIiIiISMQofBIRETF2RbsBcULLSUREpHnpu7d1alHrXQOOi4iIiIiIiMixiLdgocUM5B0vVPkkIiIiIiIiIiIRo/BJRERERERERCJq2rRpWJaFZVlcfPHFTfK4zZs3c/XVV9OzZ0+SkpLo3bs3d9xxBz6fr6mbL8fIE+0GiIiIiIiIiEjL9cQTTzB79uwmfdyePXsYPXo0u3btIjU1lUGDBrF27Vruuece1qxZc1TPJ5GjyicRERERERERiYhvv/2WG2+8kbFjx9KtW7cme9xLL73Erl1mTO6FCxfyxRdfMHfuXADmzJnD0qVLm+YFSJNQ+CQiIiIiIiIiTa6yspJLL70Ul8vFs88+i9vtbrLH+f3+6ssul4k2LCs4jvg777xzjK2XpqTwSURERERERESa3N13382yZcv4xz/+Qa9evZr0cVOmTCE9PR2A8ePHc/zxx3PuuedW379t27Zja7w0KYVPIiIiIiIiItKkli9fzr333stll13GpZde2uSP69WrF++//z6TJ08mMTGRzZs3c9FFF5GZmQlAQkLCMb8GaTqWbdvRboOIiIiIiIiIxK/DgoUnn3ySq666iuTk5OpucyUlJdi2jdvtJjk5mW3btpGRkdEkjwNT7RQYH+rBBx/khhtuqKu9Vl13SGSo8klEREREREREIqKsrIxDhw5x6NAhAsUvVVVV1ddPP/10Bg4cyK9//etGPQ7g448/prKyEjAB1fXXXw9AYmIiF154YXO9RGkAhU8iIiIiIiIi0qSuvPJKbNsO+8vJyQHgoosuwrZtMjMz+fbbb1m3bh07duxo1OMAbrjhBjp06MCwYcPo3Lkzr732GgB//vOfyc7OjsKrlroofBIREYlBxcXFdnFxsfrGi4iIiNThjDPOoG3btqxfvx6ASZMm8frrr9fX3U6iRGM+iYiIxKBA8JSenq4xCURERCTWxVuwoP2rZqbKJxERERERERERiRiFTyIiIiIiIiIiEjEKn0REREREREREJGIUPomIiIiIiIiISMQofBIRERERERERkYhR+CQiIiIiIiIiIhGj8ElEREREREREjsWuaDegEeKprS2Gwqc4Y1nWWZZlrbMsa4NlWbfVcv+VlmXtsSzrC+fvJ9FopzSeZVn/tixrt2VZq+u437Is62/Oul9pWdYJzd1GOToNWLeTLMsqDNlu72zuNsrRsSyru2VZH1iWtcayrK8sy7qplmm07capBq5fbb9xyLKsZMuyPrEs60tn3d5dyzRJlmW94Gy7yyzL6tn8LZWj0cD1q33mOGZZltuyrM8ty3q9lvu07UZHZ8Bqij/LsjyWZX1RXFxMcXExNe67yrKsvc72/aVlWdccxXN0jvCykFp4ot0AaTjLstzAQ8BkYCvwqWVZc23b/rrGpC/Ytn1DszdQjtWTwN+Bp+q4/2ygn/N3EvCw819i35PUv24BFtq2PbV5miNNqBK41bbtzyzLSgdWWJb1bo3PZW278ash6xe0/
cajcuA027YPWpaVAHxsWdY827aXhkxzNXDAtu2+lmXlAjOBS6LRWGm0hqxf0D5zPLsJWAN4a7lP2278C6zfEXXcr203DqnyKb6MBjbYtr3Rtm0fkAecF+U2SROxbfsjYH89k5wHPGUbS4FMy7K6NE/r5Fg0YN1KnLJte4dt2585l4sxO0pda0ymbTdONXD9ShxytseDztUE58+uMdl5wH+cy7OB0y3LspqpiXIMGrh+JU5ZltUNmAL8q45JtO3GsQasX4lTCp/iS1dgS8j1rdS+E3yR07VjtmVZ3ZunadIMGrr+JT6NdUqH51mWNSTajZHGc8r6jweW1bhL224LUM/6BW2/ccnptvMFsBt417btOrdd27YrgUKgffO2Uo5WA9YvaJ85Xs0CfgX467hf2258O9L6BW27cUnhU3ypLbGv+SvOf4Getm0PA94jmPpL/GvI+pf49BmQY9v2cOBB4NUot0caybKsNGAOcLNt20U1767lIdp248gR1q+23zhl23aVbdsjgG7AaMuyhtaYRNtuHGvA+tU+cxyyLGsqsNu27RX1TVbLbdp240AD16+23Til8Cm+bAVCk91uwPbQCWzb3mfbdrlz9TFgZDO1TSLviOtf4pNt20WB7gG2bb8JJFiW1SHKzZIGcsYTmQM8a9v2y7VMom03jh1p/Wr7jX+2bRcAC4CzatxVve1aluUBMlAX6rhT1/rVPnPcOhk417KsTZghSE6zLOuZGtNo241fNdfvYbTtxi+FT/HlU6CfZVm9LMtKBHKBuaET1BhH5FzM+BTSMswFrnDOnDUGKLRte0e0GyXHzrKszoGxCCzLGo35bN4X3VZJQzjr7XFgjW3bD9QxmbbdONWQ9avtNz5ZlpVlWVamczkF+B6wtsZkc4EfOZcvBubbtq3qiTjQkPWrfeb4ZNv2r23b7mbbdk/MsdB827YvqzGZtt04Vcv6PYy23fils93FEdu2Ky3LugF4G3AD/7Zt+yvLsn4HLLdtey5wo2VZ52LO0LMfuDJqDZZGsSzreWAS0MGyrK3A/8MMkIlt248AbwLfBzYAJcBV0WmpNFYD1u3FwP9YllUJlAK52kmKGycDlwOrnLFFAG4HeoC23RagIetX22986gL8xzmTsAt40bbt12vsUz0OPG1Z1gbMPlWtB0ISkxqyfrXP3IJo223ZtO22DJb2j0RERGJPcXGxDZCenq4z9IiIiEiro32hlkXd7kREREREREREJGIUPomIiIiIiIiISMQofBIRERERERERkYhR+CQiIiIiIiIiIhGj8ElERERERERERCJG4VMLYVnWT6PdBokMrduWTeu35dK6bdm0fls2rd+WS+u2ZdP6bbm0buOfwqeWQxtjy6V127Jp/bZcWrctm9Zvy6b123Jp3bZsWr8tl9ZtnFP4JCIiIiIiIiIiEWPZth3tNjSLs846y967d2+0mxExe/bsISsrK9rNkAjQum3ZtH5brmNdt36/HwCXS78TxSJtuy2b1m/LpXXbsmn9tiyh+0Jat7FtxYoVb9u2fVZ907Sa8AloNS9URETiX3FxMQDp6elRbomIiIhI89O+UFyxjjSBfk4VEREREREREZGIUfgkIiIiIiIiIiIRo/BJREREREREREQiRuGTiIiIiIiIiIhEjCfaDRAREZHDaXBNEREREWkpVPkkIiIiIiIiIiIRo/BJREREREREREQiRuGTiIiIiIiIiIhEjMInERERERERERGJGIVPIiISHZVVtV+u7XpD2fbRP7YhIjnvoxHp19saaXmKiIhIHPDbNn7bjnYzGkxnuxMRkeZT5Yete2HdNnN5+QY4Lgcy28BDb8LAbnB8L+jghcfegcKShs23gxdG9oW+XWDdVnjvy6Zrs8uC0f1haA9okwx/f8O0PZrapcGovtAvGzbugnkrotueeJecACf2h8HdwVcBT7wf7RaJiIjIszcB8MO8vVFuSGwalJXA2QNSSE2w6Jbhxpsc27VFCp9ERKT5uCzYcQBSk8z1Pp1McARQXgFt20DHTGdaFyxY3bD5HpcDP/6eudwjCz5cDU35Q9D0idAu3VwuKoVPv2nCmR+F/tnwkzPM5Zws+Ogr8MfPL18xJykBrj3L/AdYvx22749um0RERASARfm+aDchJo3rkUSi26LSD6UVNt7kaLeofrEdjYmISMtiWcEQB6CwNHh53EBYsi54fezAhs/3q83BKqkOXlMR1JRC2zVuQNPO+2h8sx32FpnL3lQYmhPd9sS78gr4bGPw+rhGvPdEREREmpnHBaO6JlZfj/WqJ1D4JCIiza19SPiU5Q1eHtUXVnwb7NI2uLvpjtcQfhs+WR+83tThwZK1wcsnDTAVXNFkA0tDg7oYCMTiXeg6bkzwKSIiItLMhndJJDXRxDmJbkiOgz5tCp9ERKR5tW0TDG86t4WdB8zl5ETo0xnWbDHXXRac1L/h810cwfBg7VbYX2wuZ7YxwVi0RfL1tkbL1geDz0HdzLhaIiIiIjFobI/wqifLivIPow2g8ElERJqXywVtQw7s9xQFL48dcPShyucbocwZE6B7B/PXVA6rNIqBsGdVPhQ73RY7ZkC/LtFtT7wrLoXV+cHrY1RNJiIiIrHHZcFJ3ZKqr2ckx37wBAqfREQkGtqHdLfLTA1ePmlAeMhzfC9ICf6yUy9fpem2F9DUAdHiGOvmVuUP72oYC4FYvFM1mYiIiMS4gVkJZKaYKMfjgtQEhU8iIiK1a5cGge/J7lnBLm0ZqZCVAd/uMNcTPHBi34bPNzQ8aOqBwVd+BwfLzOXObaF3p6ad/9EIe70KS45Z6MDyw3tCm6Q6JxURERGJhnjscgcKn0REJBo8bsgIGUw89LT24wbWqDJqRKjy6TdQWWUu9+9qznzXVCr9Zv5H065I+exbKKswl3tkQdf20W1PvNtbBOu3mcseN5zYL7rtEREREalhTPf463IHCp9ERCRaQs96lxJSYTJ2QPiZx07sZ4KAhjhYBis3Ba839bg9sVZpVF5pAqiAWOgOGO9Cg89YWMciIiIijp5t3XRON/vFLgvaJCp8EhERqV9o+NSzY8jg2ZngdsEOpxoqNQlG9Gr4fCMZEK3YYMaWAujVyXS/i7YlMRaIxbvQ5TmyLyTGwbmLRUREpFUY2yP4g603ycIVJ13uQOGTiIhES2ICpKeYy24XbNkbvO+wrneNqOhZFjII93E5wedoCmUV5qx6R9OuSFm23gw+DjCwW3ioJ423ZS9sdd6LKYkwond02yMiIiLiGBvS5c6bHF9xTny1VkREWpZ2IUGJO+SXm5P6w7KQ8OmkRoQ8+4qD4/a4XTCqEQOWN0To2fiaulvf0ThYBl9tDl4frXGKjtnSkABzTP/otUNERETEkdXGRa92wYrs9KT4qXoChU8iIhJNvorg5Yqq4OW9xeGDhe8ravg8LaB9yGP3NuKxDdEhgvM+WmFtKo5eO1qKWFzHIiIi0qoVl/vxVdnV1yv9UWzMUVD4JCIi0WHbpkopIPS09kvWhp9NLnQcpyPp3zXY9aywJLwqqCmMO8p2RUrPjpDdzlwu
KYcvNtY/vdTP4w6vHouFdSwiIiKtXlklfL7dV329qCy+0ieFTyIiEh0Hy4KDdxeXQE4nc9lvmzO4hXaXa0wAEBpaLVtn5tdUOrc1A42DafuKDU0376MV+nqXbwivIJPGG9HLDHIPZtD7Tbuj2x4RERERx5LN5dWXC8uacB+3GSh8EhGR6AjtSrd5rzlfLMDXW0zAk5Jorm/bFz4Y+ZGMCxmHKXTQ8qYQOsD45xvNAOTRFtomVekcu9BxvJY08ftHRERE5Bh8utVHlfPDakmFTUVV/ARQCp9ERCQ6QrvcJbiDl4+ly133DtCtg7lc6mv6Lmix1uWuYwb07WIuV1SZyic5ei4rPHyKhXUsIiIi4igqt/l6d0XY9Xih8ElERJpfSbkJhwDKfMGubGDOJhd6hrHGBACh4dCKDcFufU0hsw0M6m4uV/lh2fr6p28OoSHdl9+Z5YRWiswAACAASURBVCpHb2A3aJdmLh84CGu3Rrc9IiIiIjUs2Ryf4z4pfBIRkeYXWvW0aTckOKeN/XanOVNdRpvgdOu3NXy+keyCNmZAsGvgV5uhqKRp5380Yq0SK96FhnlLm3i8MBEREZEmsHRL8MfGg+V2dTe8WKfwSUREml/oeE92yBfmkrXhYzYtWQsN/T7t4DVnugOorIJPvznmZoYJDXqWxEDQ402FwU4llt82YYkcm3Ea70lERERi255DfjbsM13vbOKn653CJxERaV7lFeZMd2BCopyOwfuWrKsR8jQiAAitevpyExxqwi5oqUkwvFfweiwEE2P6g9v5Gl+zBQoORbc98a5nR+jSzlwuKYcvvotue0RERETqEI9d7xQ+iYhI86rZ5a76tPYHTLe2jpnm+sEyWLWp4fONZGXSiX2Dg6Jv2AG7C5t2/kdDXe6aVujy/PQbE4yKiIiIxKClm4M/shaX2/jt2K9+UvgkIiLNK7TLXXnwVxvT5S4kAPhkPVQ28Jec9BQYmmMu++2mr0yKtS53KYlwfO/g9VhoU7wbG2PrWERERKQOmwur2FZkTqzjt83YT7FO4ZOIiDSfyqrwgbq7tDf/bRsWrYHhPaG41NzWmADghN6mO195hTlD2YGDTdZk3C44vk+wq2AsVBmN6AVVNlRUwsadsLMg2i2Kbx280KezuVxRCZ9uiG57RERERI5gyWYftm1z0OePi3GfLDsOyrOaSKt5oSIizca2oaLKhD6+CiivdC5XQpnPXPdVQKkPSsuhxGfCpW93mHF1DhwEj9OdreCQuexNgaknmml8lfX/VTj/K6sgLQVG9jFBwp4m6BZX6jPtO3AIikugaweYNNS0K9oqKmHddhjUDbLbwS6FT8csOcFUzJ13EmzeE+3WiIiItHrF100G4MlVUW5IjEpLtEhPsvhqVwU3jU8nM9kdzeZYR5xA4ZOISCtk2ya0CQRFgaqhwOUyJ0wqcUKjUp8Jk0qc/2UVTrjkA5fL/HlcpkrI7TZjN7lcwf8elxkzKcFjvpr8NlT5oarKdK0LhEiB+ZY5bfC4TSiQnFj//6QEsI74nVc/X6UJrXYVmEqiykro1BY6ZUCnTEhJapJFf8x8FfDhV9AxA4b1PPbXLUEvfgw/GB/tVoiIiAhAxww2PvcJ3113QbRbEtO+2VtBSqLF5ce3wRW9/cIjPrGnOVohIiJNxO93AiKnoqi8IuRypQmDyitNVVEgNCoNCYxKfcFpwgKjkL+w0MgJjBLc5i+zDSRkBIMkjxM0Vfmd0CokmApcLil12uDcbttmzKLkxPAQKb3N4cGSO4K9w/027C+GnQdM4FRwyFRNdcqEk7uY1xprwY6vEhauMYOyj+gVe+2Ld5bzvhcREZHosyx9NTdA/ywPy7dW8PGmcib2So52c+qk8ElEpDn4/SFd0mp0TwvcFgiKSkOrjWoEOb5KJzByB/+7rJDgyLkcGholeSAt2bnNDYlOaHSk4CJQHRXahoOlh7ep1GemS0oIhkYpzl9GiqkcCgRKKYkNe+5IKS41YdPOAnPGujZJ0DkTjsuBLG+wC2As8lXCwq9MO0/oreApElwWuLVcRUREYoLLwkIDVR+RZTG8SwIffVdOz7YeemTGZswTm60SEYkVgYqeurqnBcY2Cu2eFqg0KvUFu7BVVjqBkRMaBaqOXC5zsOtywqPQKqOUJPCmBquMEtxNE9xUVjmVUaEhkq/GdSdU8riCFUopCcHLGW3CryclmAP3WFNeAbudbnQ7D5j12bkt9MiCE/uZMCweVFTCR19B+3QzrpWCp8jRshUREYkNgconfTUfUWqCxaAsD7NXlXDtSWm0SYy9yE7hk4i0TJVVNbqkhYZHzm1lFeHd0wKBUVlFsAtZZZUTGHlq76IW6KYWCIwSPJCeDO3aBLulJTiPjeRBrd8+PDiqLVQq9ZlpQ8OkQBe49mnh3eFSItztLRKq/KYr3Y4DJmwqKDGVQl3aQv/s2OxKdyQVlWaMp3ZpMKpv/LU/nmgPV0REJHZYFhZWNMcxiiud0z3sK7GZu6aU3GGpWDG23BQ+iUjssG0nNKrRPS2sm1rF4ZVGgW5goZVGfjs4yLXb7XRFcwUrjAIDY4eGRhltoENGeKWRO8Kh0ZGWR0VVjWDMV2McJ1+Nbm81QqX0VMgKCZhSEs3rirEvo6Nm26YrXSBs2lUA6SmmumlEb8jKiL8ALVRFFXz4NbRNg9H9Ws56i1UaWEJERCR2uEzlk3Z/Gm5wJw9LN/v4ZKuPk7rHyMlyHAqfROTYBUKSusYzClwO65IWUm0UOLNZmc/Mz+MOjk8UGhS5Q8czcgKiRA+0SQ4PjBI8sR04VPlrD5Jq++9yhQdHgXGTMlPDq5aSE1tPxUZ5hQmaAoGT3zZhU8+OMHaAWRYtQUUVfLjarOuT+mvPqzlYtJ7tSEREJNZZlr6aG8llWZyQncD8b8vonuEh2xs745kqfBJpzQIDStc1nlGg6qikPLz7VmjVTZkznUUwLAqER9WDYtcyCHZyoqlQCQ2MAo+PR37bLLO6QqXQy5VVwQG5Qwfnbpd2+O2xPAB2c6nyw54iJ3DaD4Ul0DEDurSDwd0hI7XlBTOVVfDRavCmmECtpb2+WKVudyIiIrHD0lfz0UhLcjGwQwJzVpdwzYlpJCfExgJU+CQSj/x+JyAKGc+otu5ppTWCj9CxgAJBU3VYVOMMaqH/Q0OjtGRo2+bw0KgldlUJVHSV+ZyxoWqESTUHFk/0OOFRUjA8SksxXb9CQ6ZEj8KE+tg2FJXAdids2lVgBl7Pbgcn9IWOXvPebKkqq8wYT21SYNyAlrltxSp1uxMREYkdLpcqn45S90w3+0r9vLmulAuGpMTE+E8Kn0Sak99/+HhGYWdOqwh2QQsdz6jcGeMocLmi8vAKowRXyNhGzv/EQDjkMdUhHUICo0Tnsa3xQKuqKryKq7SOM9WV+kxFV2iYlJIIqYGzvYXclpQQv1VbsaDMZ7rRbd9vAicsM0h4n84wflDL6Up3JJVOV7vUJDh5YOvcPqNJe7giIiKxw8KpftJ389EY1jmBhZvK+XKHjxHZ0R//SeGTSENU+WvpklYR3mU
tbKyekLOLVXfFqoDKymBgVB0cuULGNnIqjUKritolHT6eUUsaMLqp2HZ4ZVdomFRaI2SqrAoPkwLVSm3ToEuN2xP0MRkRVVWwuygYNhWVQKdMU910XI6pdGpt7/GqKliw2rw3xw9S8BQNGtVUREQkdmjMp2PicluM6pbI29+Uke310DEtuj0HdFQlLVtlVY0uabWMZxR6prTQU9OXh3SnqrKD3c+qB8IOdE0Lvd2pKEpOcMYzcpsuVoHAyKPQqFFCz/Z2WJhUo1KpzGeWcyBICoRHbZLNGexqVilpPTQv24aCQyZo2rYfdheaarzsdjC6f/yfle5YVVXBgq9M8DRhsIKnaFG3OxERkdjhnO1O4dPRy0x2MSArgZe/KuHHo9JIdEdvYSp8kthj205oVKN7Ws3xjGqGRoHAKHB7WYUZBDrRXX+1kSckIEpNgsw2wa5qgfDI7VJY0ZSq/HV3eQuMrRS4DcLDpFTnckZm+O3Jia07vIhFpT5T2bR9P2zfZw7qs9tB/2yYNNSEgGI+7xasNp83pwxR+BFN+nlVREQkdqjyqUn0aedh3yE/735TxpSBKVFrh8InaTqB0KisjvGMDuueFtI1LTDOUSA8gmCFUXVo5Do8PErwmHDImwrtPYdXGik0aj52yNneaguRApdLfOZ9ULPLW0qiqYTp0jb8dnUxjB+VVaaiads+U91UXApdMiG7PYzo2Tq70h1Jld+c1c7jNoGcAtTo0s+rIiIiscOy1CO+iZzQNYEFG8vpucvNkE7RGUtV4ZOEn9GrtvGMqrun1RjLqMwHZZXBwKi8wuy0HxYa1RIeBQKitmnhYVGiJziNxIaKyhphUkiIVFojYKrZ7S01EVKTob3X3JYaOBOcur21CLYNBw6asGn7fthZYLbpru1h3EB1pTuSKr85q53LBacO07KKBep2JyIiEjtcTuVTtNvRAiS5LUZ3S2Te+jK6pLtpl9r8x9sKn+KZbYeHReV1dE8rDakuKg8ESM7lcqcSye0Mch0IjgJjGiUExjUKGc8o0QNpyeZ/dXAUUmkksS+s21u5CZZKQkOlkMu2HezqlhoSLHXKCAZNqU63N4WGLV9JebCyads+s867toNB3UyAkqyudA1S5TdntcOG04frszNWqLZfREQkdlgWlmXh0ndzk2jfxk3/Dh5e/bqUK05og6eZl6vCp2jw+53QqDIYAIWdOa0i2A2tukuaL7xbWmA6jye8yqi2iqNEd3A8o4zUYGCUFBIa6Zfe+Gfb5r0UqEIqKQ+vUAr9X15hQoLqUCnQ7a0NdGlnrqeq25tgutLtPABb95mw6WCZGbepW3sY2cd8pkjj+J2udrYN3xuu0DaWqNudiIhI7HCOQfTN3HT6t/ew56CPDzaWMblv847/pPCpMar8hwdEYaFR5eFhUZlzoB+YttwHvqrgeEWhgVFYiOQKjmeUngzt08IrjxITFAq0FpVVNSqTysO7wYX+97iDoVH1/yRolxYeNCUnKHCU2tk27Cs2QdPWfWYMp/bpprppwhDo6NV751j4nYqniio4Y4SCp1ijyicREZHYoQHHm55lMaZ7Iu9sKKdnZgX9OjRfr4XWEz6VV5iD87q6qYUOfl1eo/IoMG2VP6RbWh3BUeiA123bQEJGePe0JOdxCo2ktBwOlh8eKNW87LfDg6RAgNQxM3g5Ncl0e0vQgawchUNlwcqmrXvNZ1W3DjA0x1Q56ax0TcPvjPHkq4KzTlDwFIs05pOIiEjscFkqSo6A5ASLsT0SeGNdGVeluclIbp59n9YRPnlzLWZc6IRDId3UEjymwsjjCQ6CnZgAbVKCFUaB6qNEhUbShIpK4IWPIbNNMDxKTTRnA+vcNjxkSvTofSeRY9uQtxB6ZJnA6cR+5n0oTS9/DxSUwHknKSiOWTqljoiISMywAkXJ+m5uap3SPHT3+vlkSzmT+zVP97vWET6BqXCaPjHarRAx/LYZK+eS8dFuibR2Nqaq86wTot2Slq/Kb7pQJ7Wer964o9p+ERGR2GGp8imSvMkWlf7me77WswdsoVJ6iR0u51NU70mJNttWV6Pm4nKB5dKyjmXaFkRERGKHy4z5pMKnyGju5drKwie9ayVGuCydVUlig43ei83FQt9FsU7bgoiISOxQ5VNENfdybT3hE3rXSgxxWToIldhgo/dic6mueNSyjlnaFkRERGKHZWFhacynCDF1ZXazPV8rCp9QKb3EDnW/kVihbnfNx+XSso51Wj8iIiKxQ2e7i6hAPURzaT3hk37NlFiiyieJFep213wsdbeNeVo/IiIiscOpeFLhU2RozKeI0emTJZZYwQNRkWjTSI7Nw9J2H/O0LYiIiMQOjfkUUQqfIsUC3Cqllxjhdrrf6D0p0Rbodqf3YuS5nb2n0GV91V9h6olw0bimeY7pf4LzT4JLJjTN/FobbQsiInK0ev8U5t8DPTtGuyUth3O2u+YIn05+ZDcvTG9Ht4ymiUhOeHAXr17enh6ZsRu5xO6A497cBcBwoDNFeeVN8uze3CTgLuBSIAvYCvwT+BNFeU0/8lVzLt0/vATvfgF7CqFTW7j+7LoPLpasgx/+GVISg7f97odwsTP9JffD5xvB4zbXO2eaDzY9Fsp8kJgAX86CpISmeV5fpbluWZDdDi49BY7vXf9jA77bBWfeBd8fCbN+Qp1iZeDhzXvgrudh2XqzHH9wMvz64tqnfe9L+OPLsHUfDOwGM6+AftnmPtuGP78KLy2GknIY3B1+/0Po3zV8HgWH4LTfQu/OMHuGuc1XCTc9BivzYds+eP5/YeyA4GMefRvmLDb3tU2Dy0+Fa88Mn++/3zN/+4rNOnvsevMctg0PvQnPfQRFJTDpOLj3ckhPCbbnN8/A4rXm+sTBcM9lwfsDlq6D3D/BDd+H/73A3LZuG9zzIqzeDAcOwqbHgtOXV8Adz8KiNeY5cjrCLy+AU48LTlNaDn+YDW8sh8oqGNQNXvyVua+wBO7Ogw9Xm+uXTYJfnBt87In/C23bwI4D0CYJpk80r/uFj4PLFWDWXJj1X3jmFzB+cO3rtam73W3ZAxN/Y7abey5tmnlGQm3L5vhfQId02LIPhveEF35Z/zxeW2a2if0HzTzuvxIy29Q9vVXLdv+fm4OXX1p0+Do8GtH+ifA/82H2YrONnHMi/PnHwfs++xYeeA1W5ZuQZ8wAuCsXOmaa+xuyvTd0Xo+/B0++b7bP1CQT8t1+cfD7oTbRXnYiIhJ5J98Guwtg2f3QLj14+9l3w5qtsPBe6N7h6OYd+J6/9d/QpW1wvzHaXv8U/jIXdh6ALu3MfumZx9c+7b2zYe4nUFwKGalmP/OGKea+jTvh/2ab7+AqPwzrCXdNhz6dzf0vLYIZ/4HkkGOmx39u9u33Fpn922XrodQH/bPhtz8wx1l1OULl09lP7OH/ne5lTI+kRi2Oq+fsZ8qAZC4cmlp925L/CYaGd7xbSKc0NzeMTWvUfGtqzsO9t9aX8fDSg+wr8ZPghpNzkrjtlHTSksyPamMf3h02fXmlzeS+SVw4JPWwedm2zZ8WFvPSqhJKfD
ZDOiXw+8kZ9M9KAKCg1M9v3ilgUb4PgIm9knjt61Jv/ozsovra2LDwyZvbE5gAFALnAi816HFH9hLQGfg+sBYYBTwNdAdubKLnMJq7q0NqktnQeneCLzfBFbOgZycY1beWtgGdMs0HYG0sTNAxfaIeG/rYLXth4q8hyQPvr4Qpo479eX9xLsx8Gb591BygrNwEP/ijub2+xwbc+RwM7+XMr573W+C+aHbv8FXCZX+BK06Fh64zg+x+t7P2Nn23C27+Fzx5k/mCePRt+MnfTbDncZsA5cVFMOc26Noe/vQK/OLf8Oad4fO5bw707QJ+O3wZnNgPrp4M//NILd1ebHjgahPO5O+Byx8wQcu5o83dz38EL34MT9wE/bqYQC0j1cxjzmJ4ZalpV0Yq3PQvE7Y9cLV57J9eNaHUwntNUHXdwyaQuPOS4NNXVMLvXnC+GEM+RxI85mD2ilPhmofC2+y3nTDoV9C1HXywCm54FN6+O7gz8+unzZf2+783gcXXm4PzuOcFE6wuug/2Fpvgs1t7+MF4c/+BgzBuALx1F2zdCxfNhKmjgssTIH83zPsMOmYc+fOvKbsavbzULOv/fmKWY1JC08y3KdW1bNwWnHWCWbeL19S/TNZvg9ufNu+7oT3gtqdM4Pj3a+t+zJG2+6b4XAg8NJqfLZ0y4edT4aPVUFYR3paiUvjhKTBxCHhccMdz8Msn4alfOBMcYXsPdaR5TR4O004278eCg3DdI/DkfLjmjLrbrm53IiItn4XZH5v7KVx1urlt7Vbz4yEc23Fj4HvEsoiZYV92HoBfPA6P3QCThsL8VfCzR8x+Zgfv4dNfMgFuPtccz+48YI4X+naBs0eaQGryCPjzVdAmGf76Ovz0oeCP/ZYFJ/Qx+941lfrMcdIdl5jnfWEh/PhB0442ybW33TKVT1Ydw2IH7qvr/roc6XFWyDTH4mjadrSO75LIf6a1p22KixKfn99/UMRDSw9x2ylmHS/9n07V05ZU+DntX3sY0z2p1hHH31hbxourSphzaQe6et38aWExN79RwJtXZgFw/8IiCstsFl7bERu47pX9YIqKbqmvjQ2tfLoCWAosA35EIHzy5o4BXgW6UpRX5dx2AXA3RXnD8Oa6gF8B1wCZwPvAdRTl7cebezpwBtCPorwtzvMsxZt7GbAYb+7fKMrbgDe3HfBn4EwgBfiQorzznec6D7gb6A3sAa6nKO8tvLmbgJ9QlPeeM9//x+vL4cZzTGAx7ldw3xXmF1OAn54J157VwEXRQKEp98i+MLq/qaoZ3f/waQNn1qnzDDuWua+2+1vzY19Zaj7cRvQ2IcM5o81jt+yFUbfApw8Eu0/MWwG/fyE4n4fnBathxg+C/7uCsC+IwPOP6G2qdzbvCd7+zXZz0Pn1ZujcFmZcBGccb6ogUpNMqv/mCvh0gwnBfj7FPO7Fj01QMryX+bBN8MBxPc0vCH96xYRBv/mBOVgCmL/ShBDb90NaCvzkDLiuCd+ncxabg8TQ9/6QnNqnXfi1ee+e5FQkXf99+Ot/4ZNvTMXH1n0wup8JWAEuHAePvxu+DldsgPXb4YcTzesP3JecCNc4lQ1u1+Hr/mdTgpf7ZZtlveJbOH8M+P2mHQ9cbaqxAHp1Dk7//krInQDdnMDnZ9+H3D/CvVdASpIJbs48ATKcapWzTjAVi6HP/693zcHt3uLwM2H1yzZ/3+0y10Mfk5YCt4Z8Bkw+HrpnwVebTRXUtztMJdknfw5WWQ0P+dXnvZXmALpNivnLnWDCvVwnVK2qgpMHm/dQr85m2e8qcAaxd9rxv0+Y/3uK4Ff/MaHsGc4vXNNmwoVjzfvTtuHrLXDxffDy7eb63Xnw6lLznuzaHh78qVm+5RWm0uf1T819Z50Ad+aGVwTOWQK/vBD+8prZuZkyKnjfh6vhzmdNm84fYwKci8YFw+K8hfDoW6ZidHgvmPmj4LprSnc+B7dPg988Hb5OkxNh7EDzfsYKX6fPfQj/mGcq2U7sZ8LA740w03f/Mdx6vqn++3C1eW33XGbmXeWHP7xoKoESPab67bqH4bvHTHAbWBej+pr2VFTBoOtNmPLVQ+HrCsznSN5HZl0BfPSVWaa7C810NuHbUHMt01BTTjT/V+Wb6rzQ5Xj68PBpr/oeTLsvOE1923tNR5pX6GeB5TLhYv6e+s9mp7PdiYi0DheOg5eXmB8/wey/XDQO7n8l+D16pP2eR+bBY++Yy7+80PyvPpu1FfxOCRx/PnC12ecv9cFPJptjUzD7Cv9403xn7ysy1fv/+rn58aUp7CoAb2rwe3PyCEhNNO0KVAuHCvRsqH49Fmzea17LCX3NX8A1Z8CDr5uq/bZpTvBG7d+lPTuFH3dcdqrpLfTdblNBVZsGdLurrTKqsMzP7W8XsmqXj0q/CWbuOM1L53Q3f1tczGfbK1i5s4I/LizmvEEp/OZUL8f9dSdv/KgDy7b4eHNdGQDPfFHC6G6J/P3cttX3B7rR/eadAjqlublxnKmee2LFIZ767BCWBT93KqYClU++Spu/LSnm7fVl+Krg9D5J/OoUL8mepgumsr3Bym6XZXZ7thRU1rrs3t9QTrsUF4M7eigoP7zD2ZbCSk7smlj9Wi8YnMLjnx4M3l9QxRn9kkl3qqrO7J/ConzfkCO1sTHh0wOY8Gkp3txOFOXtoihvKd7cQ8BpwLvOtD8EnnMu3wicD5yCCYf+BjwETAcmA8tCgiejKG8Z3tytwOnABkwl1EFgiPPf9HPy5o4GngIuxoRaXYB06hJ41waW/pJ1sGim+QX8B3+EIT3MAWZNf38DHnqj7iWz5h913xdQ6oOV38GVp9W+5ViW+aA5/mbzYXbmCTDjQhNkBNp+32xTAtmnswk7xg3UYwsOmkBkykiY9kdzu4UJeAoOwRl3wh8uN499bRlMGGxKQQdfbw7wLhoHv5lmnueOZ81jH3nLPM/F98FtF5uqjXXbzC/o+4rg+JvMvEf0hiX3w+p8k9i/9Ctz4Bk4QD/zeBMkTf+T6fo3faJp++cbTfiy8F7zC8T1j5gvgEUzTdeuax4yFSxtkuGXT8AjP4OT+pvn3LKn9vfPJ+vhR7Pqfv/95+baQ8/PN5pffa74C3zxHQzsCr+/FAZ1r2NGdvD5bctcX7/NbDfnn2QqXb7bBT06mGBr0nHB6av88Ntn4P6rTDkztXxLBNRXW2vbJvC6bJKZZkeBObhdvw1uedwczF88Dm45L/xLLzA/CyivhE27zTZ/5enw1Hy4wDmwnbfCHOwGpt+613SDevsu0z2vtm+/wPX6vhX3FJqqsoHdzHRffGfCiwdec0LADLjl/PCgpuZzrd8WvJ6WAh9/bbpJ5u8xB+e5E0yo47JMcLQq3+zUPLPAfPbc+JipROvbJfhzjssKdrsLrJMFq817auF94E2BDTvMDovLMtvK5j3wzt2Q4IbrH4W/zYVfTzPtWrbe/EJ2/kmwYbsTCjtBxP5iuO4f8JerzTJ+8n2zrV48zsz7rc/MZ+2TN0GvTuaz94ZHYe5va1+mg35W9/K+fkqwPLym/35qQqDvD
Q8Jn0KWc2CnKXT5f/y1qdp7/lYTRv/+BRN+/+zs4DRffGc+zx68Fn7xL/MaTz0OnvnILNN37jbL7rfPmOkD30eB5xrQFe79kVkmr94e/j4IbUvoutpfDNc+ZLq1nXk8PPG+Wd/NvUzrUrPttflkvVmmtU1Tc3s/ktrm9coSU5V2sAzapZmDhsbuwYqISMszso8Jn77dYcKe/35ivn/vfyX4HV3ffs8Hq0xPgBd+CT2yzH47HP79Hnr8ufwb+Ohe88Pz1N+b/b5+2fDoOzB3GTz9C3Ps8/UWc1xU2/fR0XxXj+htnue9L0wA9e4XZriNIT3q/s77+xvmB96ScvP6LhhT+7SffmMqyds7h+GWZYakGHajqey/aJz5Ib62Lu+rN5seBr071XNccOSv5tq6tlnABUNSeGBKJn7b5rfvFnLvgiIePLctN5+czhfbfUwdlMLFQ8O7nFkWXDIslS93+OiU7uamcemH3R96WBG4vnBTOf/57BCPX9iWrhlu/t97RWHT/2VRMVuLqphzWQcSXPDLeYU8uuwgvxh/eHyxYpuP6187UOfrfei8tozsmljrfSu2+fjZawc46LNJ8Vj89ZzMWpfd3DWlnDcoGbfLOaar4ZxBKfx3bRkb91fSPcPN7NUlnNI7WJ32oxPa8NTnhzhvsPkRfd66UoB5dTbaceTwyZs7HsgBXqQoby/e3G8xAdNfnCmex4RJ7+LNTcd0oftfb5eLPQAAGytJREFU575rgRsoytvqzOsuYDPe3MuBDsCOOp51B9ABb24X4GygPUV5gTXwofP/auDfFOUFQq9tR3wtLgsCx6O3ngdpyWajy51gNvhJQw9/zI1Tzd+x+PVTZgyc046rvfSyfxd493emy9DWfeYg8e48M34ImL6w/bNNlcNry+DKv8J7vzOD2bXWx+4/aCoCyivMLxU5WfDqsuBjZ/0Xlm8wj33tdlNFNOc2U+V29YOmG93TC8wB5S8vgFG3wsu3QXoynHqHqVCZNtO057qzzMHP1BPNmELX/N1U6Nw727R58nATTOROMAHU9ImmJDUnyzx2zmIz/o0L8+E9faKpjhqWYw5Kbz3PHLSeehwkuk0gOjTHfEh/s9106WmXZv5qM2YArHu48e/LHQfMWEdP3mSCuX+9Y4K0hfeag/NQpwyB/3sJlq6FUf3MF5KvynSpcVmmAuykAaYbpNtlfqmZPSP46fzYe06VWi9YZz4Oag9iMcuprm+Y+181B6TTJ5hpdjofCx99BR/cYyrZcv9knv+ySXDaMPNL0nmjTXXTP94005c77R7e0wSRQ39ubp8w2JRfB57/zudMMJqeEv4NE+pI4VNFJfz8nzBtvHlvgmn32m1mp+OLWea9evlfTADRP9u8Fx56E/72ExMovbDQhNiB50hOMDtJcz8x1xM9Jjg9LseMJXXPi+Y9deMUeHaB+TVp8nDzOReoygy8ltDvG5dl5nWoDDbuMF0NBzjjdtm2qf6Zf09wB+Omc0zZ9m9+YK7PXmTa3i7NVOFccK8JSDp4TRXUgK5mOwLzS9mjbwfb8cwCs3MSeL6bz4EH3zBj/9Q27sLRvOcPlcHMOZD3v+HrLXTduayQCkjn/6tLzXsu0KX2N9NMeFblD07z8ynwVT6kJcHJg0xl5OnDzK+lP5lswsbCQ6Zkfc3W8OetHguK8OcNCH3fhU4zf5V5vwS6pF17JvyzmZdpfap//axj2/h6ixl764kba5+m5vZen7rmddE487dxp/nxoVPGEcKnetorIiIth8syP9bMXmzGI+qXHaw0cgqX6t3vef1TyB1vjvHA7F+9uqxG+FTj+/3W881YncflmMet2WK+o5//KHgMA+b+uhzNd7XLDdPGmfCsvMIcYz32M3MsXJcbp5p9iNWbzY9ZmamHfz9u329+yLtrevC+cQNgwR/Mfs+6bXDtwya4q3k8XVxqxny95fwjjpdZ71dzHbsa7VJdnNU/uXqi60anceXs/cHp6nhc9eo7wv1O06qneeebMi4YnMIAZ0ykG8am8ea6MuetYDNndSmvXt6edinmzXDt6Db8cl4ht044PHw6sVsin1zf6bDbG+LEbol8en0ndh2s4qVVJXTLcB++2oqqWL7Nxx/O8FJUS9UTQMc0N6O7JXLqY7txW9DF6+b53PbV9w/tlEBFlc3wv+4E4OScRIAjVuU0pLb8R8A7FOXtda4/59xGyPULncHDLwQ+oygv37kvB3gFb24B3twCYA1QBXQC9mKqlWrTxbm/O7A/JHgK1R34tgHtD3K5TOkgQLesYEll9yzYVRi83pR/v3/BbHj/+jm43bVP07mdqTbxeEw54p258ManwftH9QNvG9NNKHei6WIzf2XrfuycxeYg93eXmsdeNM4cWAQee8kEc4A3qq/pYjesp+n+OKi7+aC87SlTGfLsAph4uwlMemRBmpN+f/tPE0aePRKWrIX2XvPYvUVmutA2JyWYA5tpJ5uS3Iw2wTLbHlmwsyD43svyBl9ngjMWTqe2wduSE52QwQX/vtG87hNvhQvvNdUtTfneTEk0FVGTR5jnvX6KGUvo252HTzugm+l6dfszMOImOHDIfEF2bW/uf+A1+PI7+HwWbH7cfAFfPNOEU7sLTRe826cFl4NF7W2CYLlyzb8n3jfr+NlbzXvD5QpWy90wFdqmQ04nuOI0c1DucpnQ74KxZkykSb8xXdUg2O6f/sP8wvTtP804Xz07mqDI5YJ3vzTVEheMddptBddrbe2u6/afP2YCnfuuCFn2SeaL+JbzzbIfP9gEFh99Ze7/v8vN+hl3G1z1N9OGLu3MfYUlpgvgz74P254wy3xojgl1LMw4VqP6mveexxNsd7cOIe/FGq+l+tc5F0wcCj+e/P/bu/P4KOr7j+Pv2YRAAtkAEUg4REiLIIeCICoqCCgq9QQ1ij4QPBEPFFBAa+lPhJ9HUVBRVMQWpREUxPvA8pNDqEcVLQG1tnIKCAlZIAdJdvvHZzezSTZZ9edAS17Px2MekN2Z2dk5dmY+8/l+vratu91qNXT2l0h5+23fHDRZOvpGGy7/gwVkfT7LKHvtY2loH/v7hA62nhevsb937nHXu89nv4eZTdzl2LrbavZE5t3pJkkh239+qX3+oVfsOD2qRQ3rwak8RF7fUWDnicjfqSl24bZjj7udM5pIe4vtvZT6UmH4ON6xx9Z9ZNrIBWzVz6zp2Ki2jFHjxFqnLZse3HU6bLqUdb0Ni9bUvuzRw8YfbNopV0gnd/pxx3tNQ7x5+XzSr1pKHdtYrbXa5lXT8jIwMDAwHD5D5Px0ySn2gGnBKvu/z+dei8a77tmxR2oVdX4/slnt5/fItULk/ehrhW151lTcq++7ItceTC6eKG151jK8xs61Bze1TZeQYA/ekpPsGir6vbx91sLjqoF2HxZ5vV2GXWclJlo5j7EXWG3Y6GlLyqzVxvFZVlsq7raqfokWXZYr1uvFZSH9bmmBBjyzUz0f36ErF+YpUBJSMBSqcTrJjRnGe7/qODv3lSvT73Mvu8NN4BxJ+cVBFZWFNHT+bp0wa4dOmLVD1y3O
V35RsMbv9f8dMlITdOpR9TXuzT3V3nt1fZF6tKynNo0TKwJoVT2yaq/Wfn9Aa0a10NfjMjWmT6ouy9mtotKgJGnUkjy1a5qo3NsytO62DLVtkihJz8eYVSW1Zz75s5MlXSIpQf7s7eFX60tqLH/2sQrkrFUgJ1f+7I2yDKXoJneStFnSSAVyVsWY91JJY+TPblOp6Z01p2sj6S+yZnZN5c9urEDOnipz2Cwpq4Yl3y8pOofOCj9Ehyq357ntWbfttqZRsUKqD79qT1NrsvGZmt+7/2ULHiy5ywqe/lgJTrhuRw0h3qpP5evatEUHLOOjPGgZIwWFVtOkoNCyDbq0tWK1bdIt62Lrbum6M93pWzaVZlxrNVWyH5K+edL9rIraTuGTxSmdpCUfWZbDDWdZAGBrnh2lkWVev8VSUgdNtmmffsee1g/cas2dMpuo4glIpLlMxd8xvnfkKcnxWdLzt1vmzJz3rJjf2hnV18/qr6TsWgqh54yv3HtcROcjrZlKRVO6yOfHWCZJOr+3DZJlcPx5udSjvY2bu9lqsrQOR8QvP82aMv5jm51QdxZIp06094oP2NDlZunLmdW7NY+VO/vCB9ae/NW73c+Q7BhOSqw8jRP1HXwJ0oQhNkiWIp3ZxIqA+xzLcHtguGW8SdYM7zf32nsrcy2gFsmK2lto+8SGLZYWHVHTvh0KWdPKXQHrwS+68HaXI6t/18jZzOdYgGJ2VFr1lAXuut4c3kdP72rfvXW6pULnLLf6UCtybT/eV2zLvjtg+06rdOuF0efYBU9xOJMqJNt/o5tCXj/Ihh8KrLD8rDctAyw5SVo5zY6Dqt7+1J5iTfij1USTLBNt4Spp1Nl2wfXuZ1H7W8iy7yLfuWW6ZSQO7VN93rG0raU3yTHnVe4dMGJFrvR9ngU2JHfd3ByV4Ror8ymjsbR1l/v3/mLLmNu+x31tW550oNQyOiV3H2zR2M43kWM+f58772qZT+GdN3pfit5Wkm2TyDgZTexpZPQ63ZZ3cNdpbT0C1vQbt3mXZZaOPV+69JTq09V0vMcSb17RgkHLLK018ynG7w8A4PDjc6yVwpHNpPfXSjOucX//fY7ULLX2654Wje2cG30dIFU+j1Rcj0bNN/ocU3G+bipt2il1rqn0RZSfc67O3WQ1KnuEb52Pz7Jhxbqaay1FKw+GayaGl33PfuslfFAPO//WxufY9Ulk2pJSa5mS2VSaPjL+OddnBbt9sVoPSVbO26n+/nOf7tN3+eVaOOwINWuYoNydpbpg3q6KcX1O7Pn6ank/OdFRSbkqXttdGFJmqo3TvFGCtu8NVry3fV95xfzSUxLUIFF6Y3gzZaTW0uNu2MdbDujaRXk1vv/0RU3Vq3XsZnfRgiFHm/aUV/uOS9YX6bpejex71lAMff3OUp3bKVmZ4SDaxV1T9D/vF+ibXWXqlpmk9TvLNOXMxkpJsvu4Ycc11AufF54Tb5niNbu7QJap1FXSgajXF8jqQI0N/z1fVt/pJEnRfWs/Kek++bOHK5CzUf7sZpJOViBniQI5S+XPfl/Sy/Jnj5D1dtdLVuPpCQVyvpEk+bPfkjRL/uzRsmDUSQrkLJc0R9K78me/LmmZIjWfAjkbJH0uKTs87bGShlY7+Kcvsdojm36Q/rxCenJU7J1/7PnxD6pYHn7V2hG/dnfsXgSircy1jItW6fbDNWWBdHYPW56C/ZbxcnJHa4a1eI3VBpp6hXuDXNemzd1s/39pgj3Vb9nUMkWGP2LN3mbfaNN2aGmBo8QEC4x8uN4+96r+0uT5khz73Ly9FpRIT3Wboixa7X5uuxZWm2nQcVLPLOtdb9QTVmPlw/VWE2fxJGuvfMdz0rrNVj9p/EV2Y3vTOe4NbfR+WFPQwudIZeXW5HBQd6u3408JF+OOsY/26ShtnvPT99FL+lhW2PJ11tzsqXdsHUTqElX1+b8sDTh/n3WhelZ3tzlPj/bWDGzISdZV/cIP7TtkZViQ67OH3fm8skZ6abV1cV8v/ANcUmonJsmmO1BqwRrHseDF1IX2lKZ9lRTURg1s2z7+hjWhCxRac8qbB9t3yN9nJ8ijmktfbbNmdOMvdNud92hvN7qTL7O/5y2zwJDPsS7Zx5zrftakeXazP+4C90RaUmrLK9kyO44bZBo715pNLppoKdbR+nS0m+qZr9mFwqffSqs2SL+/3Ob9rx0WsE5raPvmvGX2W+JzrGaTZMdC387WLG/JX+14KiiUXplowaTB91qK9YurpCv6So+9aU3hfI5txzc/sZ76vs+Xvtho29LnWNe5wZCtz0YNrIlfos/W2ZX9LKh4/3CpWZodvxu2SP27WW2sYX2tSVrE9/nSwHtsnEHdLePwrU/d+kQ7C1TRzHJEf2nay3YR1LG1bctlX7oBz6p+zj7/ykQLGkUMvEeaMszqH0S2adEBqeSAXWgFCu14H3qy1WMb2sd+V6YutP1kZa70169sXk+9Y9ln/pSop52O1b966l2rAbW/2L6/VDkIHVkHLdIsOFZW7jZ9jd5W2/OtCUCzcNOxyDp94xP7LZvz3sFfp7GUldsQDNlwoNT2n8QE22cumiZdPVAaObD6tLUd71XFm9e8ZVYctlmaNXOd8ZrUv2vs37eIyG80AODwFjlPz7zWrhVTk91rOp8T/7rnwhMtWz77VKt3+tDi8LSKuu6Pfrik6g8dI+Ne2c9KgXRsbee+3M32sLRpjHLGP+dc3SNLmvm6PXTt2tZ6817ztRVbr3rOCwalPy2z6+u0FOlv/5TmLpVuPc/GDRRaveTeHaTJ2dU/a+lau+5onmYdDU1fYuUvfI49UL/6UQvqPXFD7DpQVTlO3OdCwWBIpeVu87EEn1RYGlKDREeNG/gUKA7q8dV7Jbmb4IgUn7YEqhfjjnzWEQ192lJQ+f1OzRP1+oYiHX1EolZuLNFHW0rUNaOefI40+OgGuvPtAl3UOVmt0xL1WPjzHEdK9Dm6tFuKpv1fQJMH+JXeMEHb95br611lOq1dlXsESb3bJOmLWzOqvR7Pktwi9WqdpMxUn7YFyvXwqr06uW1Spe/wt60HtGNvUOd0bFDpdrSqYzOS9MaGYp3bKVnpKT69sq5IpUFFMpzULbOectYWalI/i3PMX7tfktbGW8Z4wafhkuYqkLOp0qv+7MckzZQ/+04FcspkdZ+mSXorqnmeJM2QHVrvyp/dUtJOSS9KCnczpyGy3ureltWA2irpGUkPRM3jSll9qQ2SkmSBpuUK5HwUDlo9LKmdpB2SRofH+214mfJlNaLmS7q50gHfp5M1ZwqGLDgwoFucVfETTVlgNw8njHNfuy0qkNV6pLTgDguyfLnRej/as996CRjcU/rtJbas5UFp6kt2E+vzWc2j529zb/rr4rT3vmjjjXjUnTY1WRrWz3r46nCj/eq0bWbbt28Xa+62IOpz6yXanrl+s2UODepuhW2/CpcOe3ap+7kdWlqQZOA9duOammzL+sYn0uf/lGaPsmCCZE3TBt9rvayt/U4afrqdUHyOKtVqqSnzyZH7q7dwlQV5gkFrMjL7xl/2puj
oVjbPcXOlXQVSt3bS/LEWbJCs2dxJHd19dtI8O2klJtjN633D3OUZc65l+PS7SyostpTbP95q21eq3BtaWoqtz8wm7mu9x1sGQ+RzJWntI/Y0aupLltp7xj3u+Bf3seCxJD04XBozx7J8/CnuOnfCwafL/mDZb+mplr0W6VJXkh67TrrzT1LXWyzwcHyWNOsG+15pKZUzFpOTLIgUaTa1aZd07Bj3/VYjLXj5xQwLaj/3FwtEHTPaHWf61Rb0q1/PmhPd+rR1UdsmXXryBgtaSnZRMGmeBZOyMqSnRrs1BRo3tADf6x9Lc5bacp3V3eoz5ax0g90L7rBt+32+9ew4e5Q7/9Hn2L7bKTzfzm2ksnD9ov3F1uRu405bzv7dLDPI50i/v0x6YLFl+eXttadWIwfYPJavkz64r/J2zWxiv60vrrBi9s/dYsGS0bNtG3ZvZ5/hc6xuUWGJZVpt2WXN107v6haD/yVUfRCQ4LN91B/ucbCwxGpvRWRdb+tm5f9aMPL0u20fOLmjFR//cIPVMpCs1tas691jOxLEuKq/Ncs9bVJ4fXa1QGi9hPAZP+ritG8Xu/DsNNr+/nZ29W11cR+rFedz7Hctsk5vfsoyf3p3cOd3MNZpLNOXSPcvcv9euMoy5yYMsabO3+20gq4PLnbH2fKs/RvveD/pDjuXXtIn/rw++sZ60dlfYsft+b2lu4aS+QQAcK/HszIqvxb9Xk3XPQOPlc48zq4rL5xq4951sdWPir7Od6LmpSr/jz7/3zTYAjND77fP+XVLad6YX+58dOox1oHTiJmWQZ2eKt1+nnv/u2CV9VK8+gFbrjc/le5dYMuU0cR6pb5hkC3vW59ap0VfbbWs+4jVD9h18Ip11rnJ/hK7TrnkFLuX8DnWYuWdz+zatf117rSRe+KY4vd2d/WiyhV6Rp/YSCN7NtSY1/eo1+M71LyRT9f0bKj3/lFSsQlGHN9Q497ao/mfF+rCzsn63YA0Se4murRbim5akq/uj27XiW2SNPvCprpngF/j3yzQC58V6oxfN9AZv2pQsWynZzXQiJ5lunKB1ZW6/dRUvbq+uGJ+E/r6NfPDvRo6f7fyi4Jq0cinYcc1VL/21YNPP9e3eaV6cHlABSUhpdV31K99fY0/zV9p3S1eV6RBHRrIH+6lLvLe1kCZBj7zg5Ze00yt/Im64cRG2lVYrrPn/qCi0pDaNknUkxc0UVoDm+7Bsxtr8tIC9Z61XSFJx2UmSdJV8ZbRCYVC8cb57+fPdjRxSFAThlrKYLdbpN3P/7hoK+CFXQHL6rr2zEO9JKjrQiF72jZx6MH93GDQAipP3xS7p9HD0ZcbLfPvxZXS3x891EuDWKa9dPCPBQAAUKMQ52bP/H1HqbYFyjWoQ/IvMbu40dL4vd0dNiLpj5E/a8kzA7wWXaUO+E9wMPbFpWutGHpykjWBCskKkx/Ox0HRAcsKG9DNDTr/ptfh/Z3/27FtAAD4j+FInJs94oSbNR4sdSj4BAA4pD762tr6l5ZZs8/5Yys3yTwchUJWw2jEDMu27XpU5bpYAAAAQB1Q94JPbZtLgZxDvRQAUPdMutiGuiSlvvTBVPv/lxutmLz/J/R+CgAAABwGfPFHAQAAAAAAAH4egk8AAAAAAADwDMEnAAAAAAAAeIbgEwAAAAAAADxTdwqO33nRoV4CwJWeKl1zxqFeCsBMGHKol6Bu6HKk1LnNoV4KAAAAQJ2bJ+qY5gcvJOSEQqGD9mGHWJ35ogAAAAAAAAeJE28Emt0BAAAAAADAMwSfAAAAAAAA4BmCTwAAAAAAAPAMwScAAAAAAAB4huATAAAAAAAAPEPwCQAAAAAAAJ4h+AQAAAAAAADPEHwCAAAAAACAZwg+AQAAAAAAwDMEnwAAAAAAAOAZgk8AAAAAAADwDMEnAAAAAAAAeIbgEwAAAAAAADxD8AkAAAAAAACeIfgEAAAAAAAAzxB8AgAAAAAAgGcIPgEAAAAAAMAzBJ8AAAAAAADgGYJPAAAAAAAA8AzBJwAAAAAAAHiG4BMAAAAAAAA8Q/AJAAAAAAAAniH4BAAAAAAAAM8QfAIAAAAAAIBnCD4BAAAAAADAMwSfAAAAAAAA4BmCTwAAAAAAAPAMwScAAAAAAAB4huATAAAAAAAAPEPwCQAAAAAAAJ4h+AQAAAAAAADPEHwCAAAAAACAZwg+AQAAAAAAwDMEnwAAAAAAAOAZgk8AAAAAAADwDMEnAAAAAAAAeIbgEwAAAAAAADxD8AkAAAAAAACeIfgEAAAAAAAAzxB8AgAAAAAAgGcIPgEAAAAAAMAzBJ8AAAAAAADgGYJPAAAAAAAA8AzBJwAAAAAAAHiG4BMAAAAAAAA8Q/AJAAAAAAAAniH4BAAAAAAAAM8QfAIAAAAAAIBnCD4BAAAAAADAMwSfAAAAAAAA4BmCTwAAAAAAAPAMwScAAAAAAAB4huATAAAAAAAAPEPwCQAAAAAAAJ4h+AQAAAAAAADPEHwCAAAAAACAZwg+AQAAAAAAwDMEnwAAAAAAAOAZgk8AAAAAAADwDMEnAAAAAAAAeIbgEwAAAAAAADxD8AkAAAAAAACeIfgEAAAAAAAAzxB8AgAAAAAAgGcIPgEAAAAAAMAzBJ8AAAAAAADgGYJPAAAAAAAA8AzBJwAAAAAAAHiG4BMAAAAAAAA8Q/AJAAAAAAAAniH4BAAAAAAAAM8QfAIAAAAAAIBnCD4BAAAAAADAMwSfAAAAAAAA4BmCTwAAAAAAAPAMwScAAAAAAAB4huATAAAAAAAAPEPwCQAAAAAAAJ5JPNQLcBA5h3oBAAAAAAAA6hoynwAAAAAAAOAZgk8AAAAAAADwDMEnAAAAAAAAeIbgEwAAAAAAADxD8AkAAAAAAACeIfgEAAAAAAAAz/wbNjy33iraapUAAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# We can use the shap package\n", - "import shap\n", - "\n", - "\n", - "# shap will call the GPU accelerated version as long as the predictor parameter is set to \"gpu_predictor\"\n", - "model.set_param({\"predictor\": \"gpu_predictor\"})\n", - "explainer = shap.TreeExplainer(model)\n", - "%time shap_values = explainer.shap_values(X)\n", - "\n", - "# visualize the first prediction's explanation\n", - "shap.force_plot(\n", - " explainer.expected_value,\n", - " shap_values[0, :],\n", - " X[0, :],\n", - " feature_names=data.feature_names,\n", - " matplotlib=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAh4AAAEvCAYAAAAKDcjfAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAgAElEQVR4nO3debwWZf3/8dclSC64AykhAqlYuZR+ckkt/bkWUpYZ5oq4YGWWippGrpQbZlpaboC4W7kAhqgZLX7L+qiZ+4IsihCYgGwuwPz+uK5bhtuz3Oec+8zNOef9fDzuxzkz18w111z3zH1/7uu6ZiZkWYaIiIhIEdaodQFERESk41DgISIiIoVR4CEiIiKFUeAhIiIihVHgISIiIoXpXOsCdATjx4/PBg4cWOtiiIiIFCXUl6AWDxERESmMAg8REREpjAIPERERKYwCDxERESmMAg8REREpjAIPERERKYwCDxERESmMAg8REREpjAIPERERKYwCDxERESmMAg8REREpjAIPERERKYwCDxERESmMAg8REREpjAIPERERKYwCDxERESmMAg8REREpjAIPERERKUzIsqzWZWj3wshlqmQREVktZcM6t0a2ob4EtXiIiIhIYRR4iIiISGEUeIiIiEhhFHiIiIhIYRR4iIiISGE6XOBhZq+a2eBal0NERKQjapVraFrKzCYDXwIGufvdufm7AP8Aprt7nypsZy/gEXdfLetBRESkvVmdWzxeAE4om3dCmi8iIiJt0Or8S/8e4CQz6+fur5nZesAhwM+A7wGYWWfgTGAw0AN4DjjF3Z9I6WsClwJHAiuAKxvaoJmNAToB7wKHAouBC939utwyXwJGAJ9JeY5392Ors8siIiLt2+rc4vEucBtwXJr+NvBnYFZumQuBrwEHApsAo4BJZrZRSv8RcBDwBaAv0AfYopHtfhMYD2wMfB/4lZltAWBm2wOTgJuAzYDNgbHN3UEREZGOZnVu8QC4AXjIzM4DTgTOAzYCMLNADAwGuPtrafmbzOyHwADgVuBo4BJ3fzWtM4yVgUx9HnX3cen/e8xsPvBZYDpwErGFY0xu+T+1bBdFREQ6jtW5xQN3f5b4hf8T4OPAg7nkbkBXYLyZzS+9gH5Ar7RML2BaLr/FwJxGNjurbHoxsF76vw/wcpN3RERERIDVv8UD4Hpi18aF7r7czErz3yIGBfu6+7/qWXcmMVgAwMzWJY4Faa5pwFYtWF9ERKRDawuBxx3A68AT+ZnunpnZVcBIMzve3V8xs67A7sAz7v4mcAtwRro8903gMhp4Yl4FrgMeN7OjgLuIA1F3cffJLchTRESkw1itu1oA3P1dd3/E3efVkXwecD9wv5m9A7xCHIdR2q+LiYNB/wFMBWYQu26aW5anga8A3yF22cwAjmpufiIiIh1NyLKs1mVo98LIZapkERFZLWXDWqXzo97ehdW+xUNERETaDwUeIiIiUhgFHiIiIlKYtnBVS5s3rv9EBg4cWOtiiIiI1JxaPERERKQwCjxERESkMAo8REREpDAKPERERKQwCjxERESkMAo8REREpDAKPERERKQwelZLAfSsFhGR9quVnnXS1ulZLSIiIlJ7CjxERESkMAo8REREpDAKPERERKQwCjxERESkMG0+8DCzc8xsfCvke6OZjal2viIiIh1ZTa4BMrPJwCPuPqKl67n7z6qRt4iIiLS+Nt/iISIiIm3HanXXEzM7DDgb6AssBsYBp7n7YjP7FbAnsJuZ/QiY6e79zex8YA9337eBZcYAy9z9+Ny2pgHD3f3WND0E+DHQHbifePOTZbnlewM/B3ZPs8YDp7v7wtapDRERkfZndWvxWAAcDmxIDCD2BIYDuPvJwF+Bi9y9q7v3L1+5kmXqYmZ7AtcAJwEbAw8Dg3LpawGPAs8D/YBPA72Aq5q3myIiIh3TatXi4e4Tc5Ovmtm1wNEFbPpo4Hfu/nCaHmtmQ3PpBwHB3c9N00vN7CfA/5nZCe6+vIAyioiItHmrVeBhZvsB5wLbAB8DOgFzCth0L8DL5k3N/d8X6G1m88uWyYBNgZmtWDYREZF2Y7UJPMysC3AfcCYwyt2XmtnJwLDcYisqyKquZRYBm+S21RnokUufCfQpW6cv8Er6fzrwsrt/poLti4iISD1qGXh0TmMnPpwG1gLmpaDj08DJZevMBrZsJN+6lnHgMjPrC7wJXAismUsfC0xKg1D/DBwG7MzKwGMCMMLMzgF+SQxkegI7u/u9je2oiIiIRLUcXHoesDT3WghcQAwQFhEHe95ets6VgJnZfDN7rp5861rmNuIVMk8CU4AZ5LpH3P0vwPeBG4G3gQOBu3LpS4B9iINKXyQOgv0j8Nlm7bmIiEgHFbIsq3UZ2r0wcpkqWUSkncqGrTajFlYnob6E1e1yWhEREWnHFHiIiIhIYRR4iIiISGHUMVWAcf0nMnDgwFoXQ0REpObU4iEiIiKFUeAhIiIihVHgISIiIoVR4CEiIiKFUeAhIiIihVHgISIiIoVR4CEiIiKFUeAhIiIihdFD4gqgh8SJSEegh6VJjh4SJyIiIrWnwENEREQKo8BDRERECqPAQ0RERArT5kcCmVlv4Hlga3d/s4r57gH81d3rHSAjIiIiTVO1wMPMJgOPuPuIauVZCXefAXTNlWMwMNzdtyyyHCIiItI4dbWIiIhIYVq9q8XM1gEuBr4BrA38DTgltV
SUWkqeAPoA+wNzgNPc/f6UHoCzge8C6wA3A9sTu0HON7M+wFRg8/T6DdDFzBalIhyU/j7i7h/ur5mdD+zh7vum6a2AG4CdgNeA0WX70Rk4ExgM9ACeS/vxRMtqSEREpOMoosXjSmDX9NoCeAsYb2adcsscA/wc2AD4FXBzClgAjgJ+AAwEPg7MAr5Y14bc/e/AScBr7t41vSY3VsAUVIwnBhM9gG+mfPIuBL4GHAhsAowCJpnZRo3lLyIiIlGrBh5mtgZwNHHMxUx3Xwz8EPgUsHNu0bvc/TF3XwFcTwxAtkppRwPXuftT7v4BcDlQtUGkyS5AX+AMd1/q7q8AV+T2IwDfT+mvuftyd7+JGAQNqHJZRERE2q3WbvHoDqxF7LoAwN0XEbtTNs8tNyuXvjj9u176+wlgei49A16vcjl7AXPcfUlu3tTc/92IA1jHm9n80gvol9YVERGRCrT2GI+5wHvE1oQpAGbWldidUWnwMJPYRUNaP7Bq0FJuRR3zFgGdzOxj7v5emtezbBs9zGydXPDRN5f+FrAY2Nfd/1VhuUVERKRMtQOPzma2Vtm8scBFZvY8MJ/YhfEi8M8K87wFuNTMfk+8X8cprBo0lJtNDCLWd/d30ryXiMHH8Wb2a+ALxHEcT6b0fxBbVS4xs7NS/qeWMnT3zMyuAkaa2fHu/koKoHYHnqnm/UNERETas2p3tZwHLC17XQA48C9gBrAZ8FV3X15hnmOBa4CJwH+JXRv/ILak1OVR4GFgauoS+ZK7LwSOBU4HFhAHq95cWsHdlwFfBXYgdgPdQxxrUr5v9wP3m9k7wCvEAai6JFlERKRCIcva1hPb04DVGcCZ7n57rctTiTByWduqZBGRZsiGtfmbYUv11HvX7zZxlJjZIGJrwxrEe3qsS2wBERERkTakrXQTfJ/YzTIL+H/AV9x9Xm2LJCIiIk3VJlo83H2PWpdBREREWq5NBB5t3bj+Exk4cGCtiyEiIlJzbaWrRURERNoBBR4iIiJSGAUeIiIiUhgFHiIiIlIYBR4iIiJSGAUeIiIiUhgFHiIiIlIYBR4iIiJSmDb3kLi2SA+JE2mYHi4m0u7U+5A4tXiIiIhIYRR4iIiISGEUeIiIiEhhFHiIiIhIYRR4iIiISGEUeIiIiEhhqnoNm5kNBy4CjnH3sVXO+2jgB8A2wDLgH8AF7v5/1dyOiIiItJ6qtXiY2RrAccDbwNBq5ZvyvgC4CrgM6A70Ax4DHjWz/au5LREREWk91WzxOADoBRwMTDCzbd39WTMbCXzS3b9eWtDM9gbGAZu6+2Iz2xa4AtgJWALcBpzr7h+YWR/gx8Bx7n5XymIJcKGZ9QOuAbZK+XYFzge+QQxQZgBD3f1vZrYmcAZwDNATmAOc6e6/N7MxwDJ3Pz5XxmnAcHe/1cwGA8OBG4AfAp2AW4AfufsH1apAERGR9q6aYzyGAhPd/QHgaeDENH8UMMDMuueWHQzcnYKOHsCfgXuIAcFuwH7A2WnZ/Yl3QLujjm3eAmxpZlul6ZuAXYB9gPWJQdDslDYCOBI4NKV9CXilCfu3BdCb2NqyGzAQGNaE9UVERDq8qrR4mFlPYADxSx1isHGBmZ3l7s+b2VPEL/0rzWw94BBiCwnA0cDT7n5dmp5pZhcDlwIXElsu5rr7+3Vs+s30t4eZLQC+BWzr7lPT/FdS+QLwPWCQu/8npb2RXpVaAZzh7kuBKWZ2GXAmcHET8hAREenQqtXiURrbMSFN3wqsDQxK06OBY9P/3wJmuvtjabovsLuZzS+9iIHLpil9LtDNzLrUsd2euWX6pP9frmO57sC69aRVao67L8lNTyN2LYmIiEiFWtzikQaVHg9sCLxhZqWkTsTuljHAncTWjh2J3Syjc1lMBx5x9wH1bOLh9HcQsWsl7whgiru/nLpsII73eL5subnA4pRWV/fKImCT3D51BnqULdPDzNbJBR99aFqLiYiISIdXja6WA4m//HcGZubmbw9MMrPt3P0ZM7uXOM5iV1a2hACMBU43syHA7cD7xC/1rd39QXefamaXAleZ2VLgAWJryneIgcfBAO4+x8x+B1ybBoNOBz6Z0l41s18Dl5nZDOA5YmvJxu7+DOAprS+x++ZCYM2y/VwDuMTMzgI2I47vuLkF9SYiItLhVKOrZShwn7s/4e6zc6+HgL+z8tLa0cCXgUnuXhqbgbvPBvYmBhDTgHnAvcRBnKVlfgycDpwDvJWW+xKwj7tPzJVlCPBv4mDVhcD9rOyy+TFwN3BfSvsz6WoY4lU044AngSnEq2HyQRTEQGYmMBV4HHiQeHmviIiIVChkWVbrMqz2SpfTuvuWzVk/jFymShZpQDasqvcyFJHaC/Ul6JbpIiIiUhgFHiIiIlIYdbUUQF0tIg1TV4tIu1NvV4vO9gKM6z+RgQMH1roYIiIiNaeuFhERESmMAg8REREpjAIPERERKYwCDxERESmMAg8REREpjAIPERERKYwCDxERESmMbiBWAN1ArPXpBlQiIqsVPatFREREak+Bh4iIiBRGgYeIiIgURoGHiIiIFEaBh4iIiBRGgYeIiIgUpsXXIJrZcOAi4Bh3H9vyIn2YbwYsBVYA7wFPAcPc/d/V2oaIiIgUq0UtHma2BnAc8DYwtColWtX+7t4V6APMBe5rhW2IiIhIQVra4nEA0As4GJhgZtu6+7NmNhL4pLt/vbSgme0NjAM2dffFZrYtcAWwE7AEuA04190/KN+Iuy80s1uBw8ysm7u/lfLcHvgF8DlgHjAKuNjdlzeWbmZ9gKnAYOAsYAvgz8ARaXoIsbXlIne/JuXXB7gO2AXIgNeAw939pRbWo4iISIfQ0jEeQ4GJ7v4A8DRwYpo/ChhgZt1zyw4G7k5BRw/il/w9QE9gN2A/4Oy6NmJmGwLHAHOA+WneBsDDwJ+ATYEBxGDhtErScw4B9gB6E1tWHgempHIdC/zCzHqnZX8GzAA+DnRL6fMrqCcRERGhBYGHmfUkfpmPSrNGAUeZ2dru/jxxTMaRadn1iF/wpWWPBp529+vc/X13nwlcnObnTTSzd4itFbsCB7v7spQ2AHgfGOHu77n7C8ClwPEVppdc5O5vu/v/gAnAB+5+g7svc/eJadufS8u+Twxi+rn7cnf/j7v/t+m1JyIi0jG1pMWjNLZjQpq+FVgbGJSmRxNbBAC+Bcx098fSdF9gdzObX3oRg5JNy7bxZXdfH9iaONB021za5sA0d88/B2VKml9Jesms3P9LyqZL89ZL/59B7J4Zb2azzOyXZtYVERERqUizAo80qPR4YEPgDTObDTwPdGJld8udwFZmtiOxm2V0LovpwCPuvmHutUEaSPoR7v4KcBJwZWppAXgd2MLM8g+i6ZfmV5LeZO4+191Pcfctgd2BvYAzm5ufiIhIR9PcwaUHEgeV7gzMzM3fHphkZtu5+zNmdi8wgthNMii33FjgdDMbAtxO7MLoA2zt7g/WtUF3/5OZPQ6cSwxCHiAOHD3HzC4ntqKcRRz8SQXpTWZmg4B/AtOABancyxpaR0RERFZqblfLUOA+d
3/C3WfnXg8Bf2flpbWjgS8Dk9z9zdLK7j4b2Jt4Ncw04jiKe4ktEg05DzjOzLZ09wXA/sC+wH+BScSA5udpGw2mN9PniINiFwHPAU8CI1uQn4iISIcSsixrfClpkTBymSq5lWXDWnwvPBERqZ5QX4JumS4iIiKFUeAhIiIihVHgISIiIoVRx3gBxvWfyMCBA2tdDBERkZpTi4eIiIgURoGHiIiIFEaBh4iIiBRGgYeIiIgURoGHiIiIFEaBh4iIiBRGgYeIiIgURoGHiIiIFEYPiStAR3pInB7WJiIi6CFxIiIisjpQ4CEiIiKFUeAhIiIihVHgISIiIoVR4CEiIiKFUeAhIiIihalK4GFmk81seKXza8XMbjSzzMy+WOuyiIiIdEQdpsXDzNYDDgPeBobWuDgiIiIdUmF3ezKz7YFfAJ8D5gGjgIvdfbmZ9QGmApu7+xtp+cHAcHffMk2fApwKdAPeAW5293NSWm/g58DuaXPjgdPdfWGuCEcC7wHfB0aZ2Snu/r9c+XYBrgW2Bp4GHgKGuHuflL4OcCFwCLAB8E/gZHd/tUpVJCIi0u4V0uJhZhsADwN/AjYFBgBDgNMqXH9r4BLgIHdfD/gMMC6lrQU8CjwP9AM+DfQCrirL5kTgNuC3wELgmLLy/QG4E9iYGJyUt4rcCGwD7Jr24XFggpmtWck+iIiISHUDjx+b2fz8C9gjpQ0A3gdGuPt77v4CcClwfIV5LyPefvUzZtbV3ee7+z9S2kFAcPdz3X2pu88DfgIcYWadAMxsZ+CzwCh3/wC4hRiIlAwEFgEj3f0Dd3+K2CJDWr8b8G3gu+7+X3d/H7gA2AzYpSmVJCIi0pFVs6vlp+4+Ij/DzCanfzcHprl7/pklU9L8Rrn7a2Z2BPAd4EYz+w9wobs/BPQFeqdAJy8jtkzMJLZePOXu/05pNwGnmtle7j4Z+AQwo6x803P/901//2Nm+W2sWek+iIiISHFjPF4HtjCzkPty75fmQ2xtAFg3t07PfAbufg9wj5l1AU4C7jezTYgBwsvu/pm6Nmxm6wODgDXMbHYuKSO2ekwmBie9y8rXO7dsKQjZyt3nVrLDIiIi8lFFBR4PEAeWnmNmlxNbEM4CrgNw97fMbDowxMzOIY7TOAFYDmBm/dM6fwGWAguIgcMKYAIwIq33S2IQ0xPY2d3vJQ4qXQFsDyzJlekg4JrUjTIBuBo4zcyuTts/trR9d59jZrcD15rZD919ppltCOwNPOzuixAREZFGFTK41N0XAPsD+wL/BSYBY4lXopQcQwwGFqT5N+XSugDnAbOA+cApwCHu/q67LwH2IQYLL6b1/0gc0wGxVeMGd3/N3WeXXsAYYDYw2N3nE8ehHEG84uaalP5ergwnAC8Bk81sIfAMcCgxABIREZEKhCzT92ZdzOxiYCd337+leYWRyzpMJWfDCrtCW0REVl+hvgR9SyRmth/wLLFFZndiS8mwmhZKRESknVHgsdJ2xMts1wfeBC4Hbq5piURERNoZdbUUQF0tIiLSwairpZbG9Z/IwIEDa10MERGRmuswD4kTERGR2lPgISIiIoVR4CEiIiKFUeAhIiIihVHgISIiIoVR4CEiIiKFUeAhIiIihdENxApQxA3EdOMuERFZjdR7AzG1eIiIiEhhFHiIiIhIYRR4iIiISGEUeIiIiEhhFHiIiIhIYdpk4GFmg83s1RbmcY6Zja9WmURERKRxzb4G08wmA7sBHwDLgdeAEe7+++oUrXpSWR9x9xGlee7+s9qVSEREpGNqaYvHRe7eFdgEuAO4y8y2bnmxREREpD2qyl2n3H2ZmV0LXApsZ2bvAVcDuwNLgd8DZ7v7UgAzy4BTgcHAJwEHTnD3V1P6ZMpaKNI6e7r738q3b2aHAWcDfYHFwDjgNHdfbGa/AvYEdjOzHwEz3b2/mZ0P7OHu+6Y8NgGuBPYj3vhkEnCqu7+d0qcB1wP7ALsA04AT3f3/Wlp/IiIiHUVVxniYWRfge8Rul6eBB4DZwBbArsQAZGTZaicC3wR6AM8B48ysUzOLsAA4HNiQGGTsCQwHcPeTgb+SWmfcvX89edwGbAR8GvgU0A24pWyZIcApwAbAw8DNzSyviIhIh9TSwOPHZjYfeAP4GnAIMZDYitTi4O4ziUHAEDPL30L1Cnd/NbWCnEls+dilOYVw94nu/py7r0itJtcSWyYqYmY9gQNSmee5+zzgNOArZrZZbtHr0naWAzcCW5rZBs0ps4iISEfU0q6Wn+a7QwDMbBAwx90X52ZPAdYCugNz0rxppUR3X2Jmc4FezSmEme0HnAtsA3wM6JTbTiU2T3+nlpW5lDYr/T8rl17av/WILS4iIiLSiNa4nPZ1oIeZrZOb1w94F3grN69P6Z+0bHdiywnAImDdXHrP+jaWunnuA+4Eerv7+sBZrPqAmhUVlHmVMqUy59NERESkhVrjkab/BF4FrjCz04njLi4CRrt7PgA4NQ0inQlcQrwc9/GU5sC3zOznxIDlpw1srwuxNWWeuy81s08DJ5ctMxvYsr4M3P1NM3solfkYYtByBTDR3WfVt56IiIg0TdVbPNx9GXAQsdtkBjEQeRwYVrbojcA9wFxgB+BraewExKtLXiR2d/ybOFi1vu0tAr4DXGZmi4BrgNvLFrsSMDObb2bP1ZPVkcDCtN0XgfnA0Y3tr4iIiFQuZFlW+EYbujS2PQojl7V6JWfDWqPxSkREpFlCfQlt8pbpIiIi0jYp8BAREZHC1KR93t3rbYIRERGR9ksDAwowrv9EBg4cWOtiiIiI1Jy6WkRERKQwCjxERESkMAo8REREpDAKPERERKQwCjxERESkMAo8REREpDAKPERERKQwNXlWS0ejZ7WIiEgHo2e1iIiISO0p8BAREZHCKPAQERGRwijwEBERkcIo8BAREZHCtMvAw8yONLNptS6HiIiIrKriazDNbDhwEXCMu4+tVgHMLAOWAivS62XgHHd/qFrbEBERkdVDRS0eZrYGcBzwNjC0Fcqxv7t3BTYCRgP3mtmGrbAdAMxszdbKW0REROpXaYvHAUAv4GBggplt6+7PmtlI4JPu/vXSgma2NzAO2NTdF5vZtsAVwE7AEuA24Fx3/6B8I+6+3MzGAL8C+gFPpjwbzMPMdgauBbYB/g2s0lqSul1GAXsDOwPHmdk2wJ6AA0OIQdhPgd8Tg5/PE1tfjnT3F1I+hwHnpbpYAkx098EV1qGIiEiHV+kYj6HEL9kHgKeBE9P8UcAAM+ueW3YwcHcKOnoAfwbuAXoCuwH7AWfXtZHUEnEc8BbwUprXYB5mtgEwEfgdsDFwKvDdOrI/ATgN6Arcn+Z9EXgF2BQ4ErgcuAn4XsrrBeCqtJ11gFuA77n7esTA6KaGKk1ERERW1WiLh5n1BAYAh6ZZo4ALzOwsd3/ezJ4ifmlfaWbrAYcQW0gAjgaedvfr0vRMM7sYuBS4MLeZiWa2HFgHWA58390XV5jHQcBi4FJ3z4B/mdlNwBFlu3KDuz+V/l9qZgAvu/uNuTL8D5iUa+G4ndi6UvIBsI2Z/dvd
3wb+2lj9iYiIyEqVtHiUxnZMSNO3AmsDg9L0aODY9P+3gJnu/lia7gvsbmbzSy9i4LJp2Ta+7O4bAmsBewA/NbNjK8yjFzA9BR0lU+vYj2l1zJtVNr2kbN4SYD0Ad18CfAU4EJhiZk+Y2eF15CkiIiL1aLDFIw0qPR7YEHgjtRIAdCJ2t4wB7iS2duxI7GYZnctiOvCIuw+opDDuvgJ4wsz+Cnwj5dVYHjOBLcws5IKPvnUst6KSMjRSvsnAZDPrBHwV+L2ZPe7uU1qat4iISEfQWFfLgcQWhZ2JX/Al2wOTzGw7d3/GzO4FRgC7srIlBGAscLqZDQFuB94H+gBbu/uDdW3QzHYgDvq8ocI8JgBXA2eY2ZXAdsTBou81uvdNYGYfJ7bGPOLuC1LLC8SuIREREalAY10tQ4H73P0Jd5+dez0E/J2Vl9aOBr5MHB/xZmlld59NvJLkYGJXxzzgXuLAzLyHzGyRmS0mXhFzK2kMSGN5uPt84hiUQSntauDXTauGiqxBHHQ6zcwWAtcQ72kyrRW2JSIi0i6FLMsaX0paJIxc1uqVnA2r+F5wIiIirS3Ul9Aub5kuIiIiqycFHiIiIlIYBR4iIiJSGA0MKMC4/hMZOHBgrYshIiJSc2rxEBERkcIo8BAREZHCKPAQERGRwijwEBERkcIo8BAREZHCKPAQERGRwijwEBERkcIo8BAREZHCKPAQERGRwijwEBERkcIo8BAREZHCKPAQERGRwijwEBERkcIo8BAREZHCKPAQERGRwijwEBERkcIo8BAREZHCKPAQERGRwoQsy2pdhnbvYx/72LPvv//+u7UuR0fRuXPnbsuWLXur1uXoSFTnxVJ9F0913mRvZVl2YF0JnYsuSUe03XbbvevuVutydBRm5qrvYqnOi6X6Lp7qvHrU1SIiIiKFUeAhIiIihVHgUYzra12ADkb1XTzVebFU38VTnVeJBpeKiIhIYdTiISIiIoVR4CEiIiKF0eW0VWJmWwM3A5sA/wOOdvdXypbpBFwNHAhkwCXufmPRZW0vKqzz/YGfAdsBv3T3YYUXtJ2osL5/AhwGLEuvc9x9UtFlbQ8qrO9jgVOBFUAn4AZ3v7rosrYXldR5btn+wFPAtfpcaRq1eFTPb4Br3H1r4BrgujqWOQLYEtgK2A0438z6FFbC9qeSOn8NOAG4vMiCtVOV1Pc/gc+7+w7AEOAuM1u7wDK2J5XU9++BHdz9s8AXgNPNbPsCy9jeVFLnpR+R1wH3FVi2dkOBRxWYWR+S9bIAABA7SURBVA9gR+CONOsOYEcz61626CDiL5IV7j6XeNAeWlxJ249K69zdX3X3p4i/vqWZmlDfk9x9SZr8DxCIvx6lCZpQ3++4e+kKgXWANYmtqdJETfgcB/gRMAF4uaDitSsKPKpjc2Cmuy8HSH/fTPPzegPTc9Mz6lhGKlNpnUt1NKe+jwamuPsbBZSvvam4vs3sq2b2HPGz5XJ3f6bQkrYfFdV5alE6ALiy8BK2Ewo8RKTqzOxLwEXAt2tdlvbO3ce5+2eArYGj0tgDaQVmtiZwA3BSKUCRplPgUR2vA59I/X6l/r+eaX7eDGCL3HTvOpaRylRa51IdFde3me0G3Aoc7O4vFVrK9qPJx7e7zyCOsTmokBK2P5XU+WbAJ4E/mNk04IfACWamm4s1gQKPKnD3OcC/Wfnr7tvAU2kcR95viQfpGqnf8GDi4DBpoibUuVRBpfVtZp8H7gK+6e5PFlvK9qMJ9b1N7v9uwN6AulqaoZI6d/cZ7t7N3fu4ex/gF8RxeycWXuA2TJfTVs9JwM1mdi4wj9i/jZn9ATjX3R24BdgFKF2edaG7v1aLwrYTjda5me0B3AmsDwQzOww4Tpd4Nkslx/i1wNrAdWYfPsjzKI07aJZK6ntoumT8A+JA3l+5+0O1KnA7UEmdSwvplukiIiJSGHW1iIiISGEUeIiIiEhhFHiIiIhIYRR4iIiISGEUeIiIiEhhFHhInUIIB4QQ/pqb3iuEMK2GRSpMCGFMCKFqTw0OIfQJIWS56e4hhOkhhG4VrHtSCOGWapWlLQgh7BlCmF/rcnREIYQjm3KeV/tckYa11rnRjPf90hDCRc3dngIP+YgQQiA+h+C8Rpb7Tgjh2RDCOyGEeSEEDyEMyqVPCyEcWcd6H5kfopdTXl3L0vYKIWQhhEXp9WYIYXQIYeOW7WltZFk2F7idxut3XeBC4PwCirXayLLsr1mWbVjrctQnhHB+COGRWpejI2itug4hTA4hDK92vq2t/Nyo4bF4CfC9EMInmrOyAg+py/5AF+BP9S0QQvg28YvzOGAD4q2FTyXedKc59gb6ASuo+/key7Ms65plWVdgD2A34l0D26pRwLEhhPUbWOZI4Jksy6YUVKZVhBA6hRD0GSEiq8iybB4wERjanPX1oVJj6df/8BDCn9Kv+WdCCNuHEL4dQng1hLAghHBjCKFzbp3eIYTfhRBmpdf1IYT1cuk/CyG8lvKbEkL4YS6tT2o9OCqE8HwIYWEI4aEQwma5Yh0MPJI1fHe5LwB/ybLs8SxamqLx5t41cSjwIPHurg0ezFmWvUZ8JPXnytNCCJ1TnXytbP7NIYRR6f99QgiPp1aauSGEO0MIPerbXqqvPXLTe4UQlpVt85zUYjM/hPBYCGGnRvbhFeAtYN8GFjsYeLisLD8IIbyY3rcZIYSLQwidUtrIEMK9ZcvvnZZdN01vG0KYFEJ4K7f+mimtdGwcF0J4HlgC9AghHBZCeDq1Rs0KIVxXyi+tt2kIYXw6Vl9O62chhD65ZU5IrWMLQghPhRD2r2+n66jfMSGEW0IIo1L9zkznx2dDCP9K+/enEELP3DrTQgjnhhD+ls4DDyF8Ppfe4DEQQlgzvacvpfynhBAOCbFF7xxgr7CyBa5fPfvxpbSNBek9G5pL2yuEsCyEMCjlvSCEcHf+PK4jv+Z8VmwfQng07edraf1OufSdU90sCiH8jRj857e5TjqupoYQ3g4hPBhC2LK+MtZR5k1CCGPTcTM7xPNw41z6Kq2fuWOwV311HUIYnPb3rJTvnBDCFXUcx71y+Q4OIbya/v8VsCfwk5Rnnc8TCrE14Y8hdivMDSH8L4RwWghhi1SnC0MIT4QQPpVbp0XnSu5YvyF3rH/kuEn/N1g/ZfuySpdYld73h4mfUU2XZZleNXwB04i3UP8UsCbx4VpTgOuBdYkPkpsDHJ6WXwt4ldgEvzawEfAHYFQuzyOJLRAB+H/AUuCAlNYHyIhf3N2ItxJ/DLght/7jwCll5dwLmJabPhR4FxgB7ANsWM++HdnYfKA78B7wDeCzqXw7lW17WW56S+Cl/D6X5X8ZcF9uuiuwCNgzTe8BfJ74yIBNgb8Ad+SWHwPcmJvOgD0aKM/PUp31AzoRW4HeAjbK13kd5RwPjGjg2Pgv8NWyeYcAfdN7+7m0zNCU9mngfaB7bvmbgZvS/z2A/xEDuy7AJwAHzi07Nv6Y6qVL2p8vA58h/lDZEngeuDi3jT8Snzm0ftrG5JRPn5R+IvGY3SHl8ZX0fmxZz36
X1+8Y4jE8IK1/Ulp/HNALWAd4FLi+7Bh7E9gp7cePgLnA+hUeA5em/dw+1XUvYPuUdj4xMG/ovO6bynxs2sauwNvAobl9zICbiMfnx4mfAz+u4mfFBun4+AnwsbTea8AZufT/pbrpkupjNque57cTPys+npa5AHgRWLOuc6WOMj9IPM43Sq8HgAca+Czok+qlV311DQwm3iL+GuJn4CeBl4Gz68ojt86ruenJwPBG3sPz03aOZ+V5sBx4pOw9eCi3TkvPlTHE4+arKY9vpDJsUc+5UV/9vFo278P3qRrve1pmJ2ILdZeG6rHOum3qCnpV95VOvDNy019JB2L+y+Nu4Mr0/zeBKWV57ET84u5UzzZ+B1yW/i+dlJ/PpX8PeCo3/TIwuCyPvfIHZpp3EHAP8cNtObFrZtuyfVsMzC97rWDVD5sziR+YpQ+zJ4HryradpXXnAVOB31BHsJOW/xTxC7hHmh4CvNzAe3AQMCc3/eFJmqbrDTyIX0oLgS+W5flMaR+pP/C4Dbi2gXK9D+zVyPEzErg7N/04cGr6fz3iF/TuaXoY8GjZ+oeQPqRyx8YXG9nmycA/0/+90jr9cun7sOqH6bPA0WV5jKeeD37qDjzyX1brpPwPzc37Lqsew9OAi3LTgfh06MMbOwbSsouAAfUsez6NBx7nAI+VzbsYmFR2TOfP88uBexvIcxpN+6w4nPhk1ZBLHwq8lP4/ItVJPv2npPOc+MMkA3rn0tcAFpDOBxoIPIg/fjJgq9y8/mneZrl9ak7g8R6wTm7e8aRzvDyP3DrNCTyeK5s3p473YF4Vz5Ux5I71NG8u8LV6zo366qehwKPF73uat1VarkdD9VjXSw+JWz3Myv2/hDieYW7ZvFITbF+gd/joyOaM+MttZgjhFOAE4oEeiL8Kbm9gm4tz+UP8cm9o7EHcYJZNIEbFhBC2IT4gbEIIoW+Wjkzir/Fb8+uF3OjpEEJIZb01y7IP0uybgEtCCKdnWbYozVueVTjgMMuyF0IITxJbfn5O/NU5OrfNnYitFDsQv8QC8Vdnc3RL644PuStXiL+GetW9yofWJwZR9fnI+xDi2JrTiK0rnYm/Rv6RW2Q08Uv4SuBbwMwsyx5LaX2B3cuOnUD8NZc3rWyb+wHnAtsQfzl3In4AQ2w1gfhBVjK9LL++wDUhhKtz8zoDb1C5D4/XLMuWxMPmI+dNeTfFtNw6WQhhBuk9aeQY6E5sQXi5CeUrtzmxdSFvCpDvAiw/z8vPw7o05bNic+KXSf64nJLmQ6yL6WXp+eOxb/r7n1TfJWvm8mhIaZl8nlNyabNovjlZli3JTU+j8fOtOcrLuIQGjrsqnCt1bbOS46IpqvW+r8/KH4RNojEebc90YmS/YdlrrSzLZoYQdic2Ew8FuqUv6/HED9ZKPUVstq9YlmUvEr/stiA2qVZqH2KT5JDUBzyb2KzXlfiLrblGA4NTv+SuwNhc2p3EVpWtsyxbn7oHs+YtJn4RlfTM/f9WSt+37P1YN8uySxrJd1tiXddnlfchhLA5sWl3BPEX4wbE5ub8e3snsFUIYUfiL5/RubTpxF9H+XJukMUBu3krctvsAtyX8u2d6uus3DZnpr+9c+vn/y9td0jZdrtmWfadBva9GvqU/kkBbm9WBjsNHQNzie/pVvXku6Ke+Xmvs/IDvKRfml+U14EtwqrfHvkyzKwjPV/m0pfiVmXv3TpZlt1R4fYh9z6wcixBKW0R9Z9bUH9d9wghrJOb7sPK97b0Y6U5+TZblc6VpqprP8rrFFbd/2q979sSW4Teb2qhFXi0PROA0sC39UL0iRDC11P6+sRuj7lAFkIYQOx3bIr7iAFBvUIIQ0IIh4Z0L4o0kOsk4Pksy95uwrZOJPavb0Mc3/FZ4gE9mmaOmE7uJAY0VwMPZ1k2M5e2PrHZcGEIoTexr7MhDhwTQuiSBoGdVkpIvxquAkaGELYCCCF0DfE+KOUfdh9KAVF3Yn9xfe5j1cGnXYnn7FzggxDCrsBR+RWyLJsP3EsMTsoDrrGApfdurRDCGmkw2oENlKELcVzRvCzLloYQPk1sPi5t7w1is/Ul6XjsAZRfpnglcH6Ig0FDCGHtEMIeqZWsNQ0JIewY4qDDM4gtGw+ktHqPgfSe/hq4LMTBuKVzbLu0yGxiq2OXBrZ9B7BTCOHoEAcf70w8nm+q6h427AHie3dOOnb7E78IS2WYQDymzghxMO2OxG5JALIsm0NsKb02pMsmQwgbhhC+Hsouea9LlmVvAg8BV6T1NgKuACZmWVb6Ve/At9M50504HiWvvrpeg3jMrR3i4N5hxPFMZFn2FinYDfHKrO2Irarl+VY8SLZC1ThXmqqu+nmKGJgdlM7xrwNfzKVX633fj/gZ1WQKPNqY1Ly4D/GX8IvED88/Er+wASYRrwz5J/HX+DeJX0RNMQlYFkLYq4Fl5hGb9F8IISwmji2YT+wrr0g68Q4GRmZZNjv/IrbafC6EYE0sOwBZli0g7veXiZeu5p1I7BNeSByj8ttGsjuZ+CH1NrEPfUxZ+nnA/cD9IYR3iAMAT6Lh82sIMCaVsz63ADukD1ayLHsht635xC/Lun55jibu96T04U9afzbxsuWDiU3T84h1VOdVGWmdRcB3iF/Ci4gtLOXddocTv9TfAP7Gyvp8L+VxA3HA7+i0zRnEL5g1G9j3arieGHjOAwYRx2yU6ruxY+DHxPf6vrTMn1nZAvJb4i/22SFeeVDeskGWZVOJ/f8nEwfy3UIcxHt31fauEWlf9ycGr/8lntdjid2PpSB1ALFu5hHr6tdl2ZxAHMg9OYSwkDh26VBiE3sljiTW34vpNR84Opc+nPhDaRbxS/nOsvXrq+vpxF/uU4mfPQ8Sj7GSY4ifRQvS/pYHfFcSg/D5IYTnKtyXBlXjXGmGj9RPFi+//wHx+H8bOJA4oLVUzha/7yGEDYnH92+aU+iwajePSJR+BZ+TZdkX0/RexC/KPrUsV1uUWkmmZlkW0nQ34AnAyvrn61r3JOLg0KMaWm51EkI4gBgcrZ3V6AMmxHFEw8vHF0nbF0IYTHxvq91iUbjV4VxpjhDCxcTxRc1qsdHgUqlTlmUPEn9FSJWlpuAtKlz2NzTzV0VRQgg7EH8JPUPsKx4B3NWWPkhFitBezpUsy85uyfrqapFKTaNt3ym0luYTB8y2VxsTuysWEZuP/0Ns6hWRVelcQV0tIiIiUiC1eIiIiEhhFHiIiIhIYRR4iIiISGEUeIiIiEhhFHiIiIhIYf4/lBG7GSUzgM4AAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "# Show a summary of feature importance\n", - "shap.summary_plot(shap_values, X, plot_type=\"bar\", feature_names=data.feature_names)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/demo/gpu_acceleration/tree_shap.py b/demo/gpu_acceleration/tree_shap.py new file mode 100644 index 000000000..d591307e0 --- /dev/null +++ b/demo/gpu_acceleration/tree_shap.py @@ -0,0 +1,55 @@ +""" +Use GPU to speedup SHAP value computation +========================================= + +Demonstrates using GPU acceleration to compute SHAP values for feature importance. + +""" +import shap +from sklearn.datasets import fetch_california_housing + +import xgboost as xgb + +# Fetch dataset using sklearn +data = fetch_california_housing() +print(data.DESCR) +X = data.data +y = data.target + +num_round = 500 + +param = { + "eta": 0.05, + "max_depth": 10, + "tree_method": "hist", + "device": "cuda", +} + +# GPU accelerated training +dtrain = xgb.DMatrix(X, label=y, feature_names=data.feature_names) +model = xgb.train(param, dtrain, num_round) + +# Compute shap values using GPU with xgboost +model.set_param({"device": "cuda"}) +shap_values = model.predict(dtrain, pred_contribs=True) + +# Compute shap interaction values using GPU +shap_interaction_values = model.predict(dtrain, pred_interactions=True) + + +# shap will call the GPU accelerated version as long as the device parameter is set to +# "cuda" +explainer = shap.TreeExplainer(model) +shap_values = explainer.shap_values(X) + +# visualize the first prediction's explanation +shap.force_plot( + explainer.expected_value, + shap_values[0, :], + X[0, :], + feature_names=data.feature_names, + matplotlib=True, +) + +# Show a summary of feature importance +shap.summary_plot(shap_values, X, plot_type="bar", feature_names=data.feature_names) diff --git a/demo/nvflare/horizontal/custom/trainer.py b/demo/nvflare/horizontal/custom/trainer.py index f65f800f0..b1ec94211 100644 --- a/demo/nvflare/horizontal/custom/trainer.py +++ b/demo/nvflare/horizontal/custom/trainer.py @@ -70,8 +70,7 @@ class XGBoostTrainer(Executor): param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'} if self._use_gpus: self.log_info(fl_ctx, f'Training with GPU {rank}') - param['tree_method'] = 'gpu_hist' - param['gpu_id'] = rank + param['device'] = f"cuda:{rank}" # Specify validations set to watch performance watchlist = [(dtest, 'eval'), (dtrain, 'train')] diff --git a/demo/rmm_plugin/README.md b/demo/rmm_plugin/README.md deleted file mode 100644 index bf6e7f12d..000000000 --- a/demo/rmm_plugin/README.md +++ /dev/null @@ -1,47 +0,0 @@ -Using XGBoost with RAPIDS Memory Manager (RMM) plugin (EXPERIMENTAL) -==================================================================== -[RAPIDS Memory Manager (RMM)](https://github.com/rapidsai/rmm) library provides a collection of -efficient memory allocators for NVIDIA GPUs. It is now possible to use XGBoost with memory -allocators provided by RMM, by enabling the RMM integration plugin. 
- -The demos in this directory highlights one RMM allocator in particular: **the pool sub-allocator**. -This allocator addresses the slow speed of `cudaMalloc()` by allocating a large chunk of memory -upfront. Subsequent allocations will draw from the pool of already allocated memory and thus avoid -the overhead of calling `cudaMalloc()` directly. See -[this GTC talk slides](https://on-demand.gputechconf.com/gtc/2015/presentation/S5530-Stephen-Jones.pdf) -for more details. - -Before running the demos, ensure that XGBoost is compiled with the RMM plugin enabled. To do this, -run CMake with option `-DPLUGIN_RMM=ON` (`-DUSE_CUDA=ON` also required): -``` -cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -make -j4 -``` -CMake will attempt to locate the RMM library in your build environment. You may choose to build -RMM from the source, or install it using the Conda package manager. If CMake cannot find RMM, you -should specify the location of RMM with the CMake prefix: -``` -# If using Conda: -cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -# If using RMM installed with a custom location -cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=/path/to/rmm -``` - -# Informing XGBoost about RMM pool - -When XGBoost is compiled with RMM, most of the large size allocation will go through RMM -allocators, but some small allocations in performance critical areas are using a different -caching allocator so that we can have better control over memory allocation behavior. -Users can override this behavior and force the use of rmm for all allocations by setting -the global configuration ``use_rmm``: - -``` python -with xgb.config_context(use_rmm=True): - clf = xgb.XGBClassifier(tree_method="gpu_hist") -``` - -Depending on the choice of memory pool size or type of allocator, this may have negative -performance impact. - -* [Using RMM with a single GPU](./rmm_singlegpu.py) -* [Using RMM with a local Dask cluster consisting of multiple GPUs](./rmm_mgpu_with_dask.py) diff --git a/demo/rmm_plugin/README.rst b/demo/rmm_plugin/README.rst new file mode 100644 index 000000000..4742507d2 --- /dev/null +++ b/demo/rmm_plugin/README.rst @@ -0,0 +1,51 @@ +Using XGBoost with RAPIDS Memory Manager (RMM) plugin (EXPERIMENTAL) +==================================================================== + +`RAPIDS Memory Manager (RMM) `__ library provides a +collection of efficient memory allocators for NVIDIA GPUs. It is now possible to use +XGBoost with memory allocators provided by RMM, by enabling the RMM integration plugin. + +The demos in this directory highlights one RMM allocator in particular: **the pool +sub-allocator**. This allocator addresses the slow speed of ``cudaMalloc()`` by +allocating a large chunk of memory upfront. Subsequent allocations will draw from the pool +of already allocated memory and thus avoid the overhead of calling ``cudaMalloc()`` +directly. See `this GTC talk slides +`_ for +more details. + +Before running the demos, ensure that XGBoost is compiled with the RMM plugin enabled. To do this, +run CMake with option ``-DPLUGIN_RMM=ON`` (``-DUSE_CUDA=ON`` also required): + +.. code-block:: sh + + cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON + make -j$(nproc) + +CMake will attempt to locate the RMM library in your build environment. You may choose to build +RMM from the source, or install it using the Conda package manager. If CMake cannot find RMM, you +should specify the location of RMM with the CMake prefix: + +.. 
code-block:: sh + + # If using Conda: + cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX + # If using RMM installed with a custom location + cmake .. -DUSE_CUDA=ON -DUSE_NCCL=ON -DPLUGIN_RMM=ON -DCMAKE_PREFIX_PATH=/path/to/rmm + +******************************** +Informing XGBoost about RMM pool +******************************** + +When XGBoost is compiled with RMM, most of the large size allocation will go through RMM +allocators, but some small allocations in performance critical areas are using a different +caching allocator so that we can have better control over memory allocation behavior. +Users can override this behavior and force the use of rmm for all allocations by setting +the global configuration ``use_rmm``: + +.. code-block:: python + + with xgb.config_context(use_rmm=True): + clf = xgb.XGBClassifier(tree_method="hist", device="cuda") + +Depending on the choice of memory pool size or type of allocator, this may have negative +performance impact. diff --git a/demo/rmm_plugin/rmm_mgpu_with_dask.py b/demo/rmm_plugin/rmm_mgpu_with_dask.py index be2aa83a7..2384b209e 100644 --- a/demo/rmm_plugin/rmm_mgpu_with_dask.py +++ b/demo/rmm_plugin/rmm_mgpu_with_dask.py @@ -1,3 +1,7 @@ +""" +Using rmm with Dask +=================== +""" import dask from dask.distributed import Client from dask_cuda import LocalCUDACluster @@ -11,25 +15,33 @@ def main(client): # xgb.set_config(use_rmm=True) X, y = make_classification(n_samples=10000, n_informative=5, n_classes=3) - # In pratice one should prefer loading the data with dask collections instead of using - # `from_array`. + # In pratice one should prefer loading the data with dask collections instead of + # using `from_array`. X = dask.array.from_array(X) y = dask.array.from_array(y) dtrain = xgb.dask.DaskDMatrix(client, X, label=y) - params = {'max_depth': 8, 'eta': 0.01, 'objective': 'multi:softprob', 'num_class': 3, - 'tree_method': 'gpu_hist', 'eval_metric': 'merror'} - output = xgb.dask.train(client, params, dtrain, num_boost_round=100, - evals=[(dtrain, 'train')]) - bst = output['booster'] - history = output['history'] - for i, e in enumerate(history['train']['merror']): - print(f'[{i}] train-merror: {e}') + params = { + "max_depth": 8, + "eta": 0.01, + "objective": "multi:softprob", + "num_class": 3, + "tree_method": "hist", + "eval_metric": "merror", + "device": "cuda", + } + output = xgb.dask.train( + client, params, dtrain, num_boost_round=100, evals=[(dtrain, "train")] + ) + bst = output["booster"] + history = output["history"] + for i, e in enumerate(history["train"]["merror"]): + print(f"[{i}] train-merror: {e}") -if __name__ == '__main__': - # To use RMM pool allocator with a GPU Dask cluster, just add rmm_pool_size option to - # LocalCUDACluster constructor. - with LocalCUDACluster(rmm_pool_size='2GB') as cluster: +if __name__ == "__main__": + # To use RMM pool allocator with a GPU Dask cluster, just add rmm_pool_size option + # to LocalCUDACluster constructor. 
+ with LocalCUDACluster(rmm_pool_size="2GB") as cluster: with Client(cluster) as client: main(client) diff --git a/demo/rmm_plugin/rmm_singlegpu.py b/demo/rmm_plugin/rmm_singlegpu.py index 50d4a7ea3..b4dccd805 100644 --- a/demo/rmm_plugin/rmm_singlegpu.py +++ b/demo/rmm_plugin/rmm_singlegpu.py @@ -1,3 +1,7 @@ +""" +Using rmm on a single node device +================================= +""" import rmm from sklearn.datasets import make_classification @@ -16,7 +20,8 @@ params = { "eta": 0.01, "objective": "multi:softprob", "num_class": 3, - "tree_method": "gpu_hist", + "tree_method": "hist", + "device": "cuda", } # XGBoost will automatically use the RMM pool allocator bst = xgb.train(params, dtrain, num_boost_round=100, evals=[(dtrain, "train")]) diff --git a/doc/.gitignore b/doc/.gitignore index 61e15164c..26725cafb 100644 --- a/doc/.gitignore +++ b/doc/.gitignore @@ -6,3 +6,5 @@ doxygen parser.py *.pyc web-data +# generated by doxygen +tmp \ No newline at end of file diff --git a/doc/conf.py b/doc/conf.py index f8926e73b..68ec39181 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -19,7 +19,6 @@ import sys import tarfile import urllib.request import warnings -from subprocess import call from urllib.error import HTTPError from sh.contrib import git @@ -148,12 +147,20 @@ extensions = [ sphinx_gallery_conf = { # path to your example scripts - "examples_dirs": ["../demo/guide-python", "../demo/dask", "../demo/aft_survival"], + "examples_dirs": [ + "../demo/guide-python", + "../demo/dask", + "../demo/aft_survival", + "../demo/gpu_acceleration", + "../demo/rmm_plugin" + ], # path to where to save gallery generated output "gallery_dirs": [ "python/examples", "python/dask-examples", "python/survival-examples", + "python/gpu-examples", + "python/rmm-examples", ], "matplotlib_animations": True, } diff --git a/doc/gpu/index.rst b/doc/gpu/index.rst index 4489c1427..a11b753fe 100644 --- a/doc/gpu/index.rst +++ b/doc/gpu/index.rst @@ -23,20 +23,19 @@ The GPU algorithms currently work with CLI, Python, R, and JVM packages. See :do :caption: Python example params = dict() - params["device"] = "cuda:0" + params["device"] = "cuda" params["tree_method"] = "hist" Xy = xgboost.QuantileDMatrix(X, y) xgboost.train(params, Xy) .. code-block:: python - :caption: With Scikit-Learn interface + :caption: With the Scikit-Learn interface XGBRegressor(tree_method="hist", device="cuda") - GPU-Accelerated SHAP values ============================= -XGBoost makes use of `GPUTreeShap `_ as a backend for computing shap values when the GPU predictor is selected. +XGBoost makes use of `GPUTreeShap `_ as a backend for computing shap values when the GPU is used. .. code-block:: python @@ -44,12 +43,12 @@ XGBoost makes use of `GPUTreeShap `_ as shap_values = booster.predict(dtrain, pred_contribs=True) shap_interaction_values = model.predict(dtrain, pred_interactions=True) -See examples `here `__. +See :ref:`sphx_glr_python_gpu-examples_tree_shap.py` for a worked example. Multi-node Multi-GPU Training ============================= -XGBoost supports fully distributed GPU training using `Dask `_, ``Spark`` and ``PySpark``. For getting started with Dask see our tutorial :doc:`/tutorials/dask` and worked examples `here `__, also Python documentation :ref:`dask_api` for complete reference. For usage with ``Spark`` using Scala see :doc:`/jvm/xgboost4j_spark_gpu_tutorial`. Lastly for distributed GPU training with ``PySpark``, see :doc:`/tutorials/spark_estimator`. 
+XGBoost supports fully distributed GPU training using `Dask `_, ``Spark`` and ``PySpark``. For getting started with Dask see our tutorial :doc:`/tutorials/dask` and worked examples :doc:`/python/dask-examples/index`, also Python documentation :ref:`dask_api` for complete reference. For usage with ``Spark`` using Scala see :doc:`/jvm/xgboost4j_spark_gpu_tutorial`. Lastly for distributed GPU training with ``PySpark``, see :doc:`/tutorials/spark_estimator`. Memory usage @@ -67,7 +66,8 @@ If you are getting out-of-memory errors on a big dataset, try the or :py:class:` CPU-GPU Interoperability ======================== -XGBoost models trained on GPUs can be used on CPU-only systems to generate predictions. For information about how to save and load an XGBoost model, see :doc:`/tutorials/saving_model`. + +The model can be used on any device regardless of the one used to train it. For instance, a model trained using GPU can still work on a CPU-only machine and vice versa. For more information about model serialization, see :doc:`/tutorials/saving_model`. Developer notes diff --git a/doc/install.rst b/doc/install.rst index 51f0d0d60..bf90a913b 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -189,7 +189,7 @@ This will check out the latest stable version from the Maven Central. For the latest release version number, please check `release page `_. -To enable the GPU algorithm (``tree_method='gpu_hist'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix). +To enable the GPU algorithm (``device='cuda'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix). .. note:: Windows not supported in the JVM package @@ -325,4 +325,4 @@ The SNAPSHOT JARs are hosted by the XGBoost project. Every commit in the ``maste You can browse the file listing of the Maven repository at https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/list.html. -To enable the GPU algorithm (``tree_method='gpu_hist'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix). +To enable the GPU algorithm (``device='cuda'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix). diff --git a/doc/parameter.rst b/doc/parameter.rst index 382cddd4f..2072c4b75 100644 --- a/doc/parameter.rst +++ b/doc/parameter.rst @@ -34,27 +34,6 @@ General Parameters - Which booster to use. Can be ``gbtree``, ``gblinear`` or ``dart``; ``gbtree`` and ``dart`` use tree based models while ``gblinear`` uses linear functions. -* ``verbosity`` [default=1] - - - Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), 2 (info), 3 - (debug). Sometimes XGBoost tries to change configurations based on heuristics, which - is displayed as warning message. If there's unexpected behaviour, please try to - increase value of verbosity. - -* ``validate_parameters`` [default to ``false``, except for Python, R and CLI interface] - - - When set to True, XGBoost will perform validation of input parameters to check whether - a parameter is used or not. - -* ``nthread`` [default to maximum number of threads available if not set] - - - Number of parallel threads used to run XGBoost. When choosing it, please keep thread - contention and hyperthreading in mind. - -* ``disable_default_eval_metric`` [default= ``false``] - - - Flag to disable default metric. Set to 1 or ``true`` to disable. - * ``device`` [default= ``cpu``] .. 
versionadded:: 2.0.0 @@ -67,6 +46,29 @@ General Parameters + ``gpu``: Default GPU device selection from the list of available and supported devices. Only ``cuda`` devices are supported currently. + ``gpu:``: Default GPU device selection from the list of available and supported devices. Only ``cuda`` devices are supported currently. + For more information about GPU acceleration, see :doc:`/gpu/index`. + +* ``verbosity`` [default=1] + + - Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), 2 (info), 3 + (debug). Sometimes XGBoost tries to change configurations based on heuristics, which + is displayed as warning message. If there's unexpected behaviour, please try to + increase value of verbosity. + +* ``validate_parameters`` [default to ``false``, except for Python, R and CLI interface] + + - When set to True, XGBoost will perform validation of input parameters to check whether + a parameter is used or not. A warning is emitted when there's unknown parameter. + +* ``nthread`` [default to maximum number of threads available if not set] + + - Number of parallel threads used to run XGBoost. When choosing it, please keep thread + contention and hyperthreading in mind. + +* ``disable_default_eval_metric`` [default= ``false``] + + - Flag to disable default metric. Set to 1 or ``true`` to disable. + Parameters for Tree Booster =========================== * ``eta`` [default=0.3, alias: ``learning_rate``] @@ -160,7 +162,7 @@ Parameters for Tree Booster - ``grow_colmaker``: non-distributed column-based construction of trees. - ``grow_histmaker``: distributed tree construction with row-based data splitting based on global proposal of histogram counting. - ``grow_quantile_histmaker``: Grow tree using quantized histogram. - - ``grow_gpu_hist``: Grow tree with GPU. Same as setting ``tree_method`` to ``hist`` and use ``device=cuda``. + - ``grow_gpu_hist``: Grow tree with GPU. Enabled when ``tree_method`` is set to ``hist`` along with ``device=cuda``. - ``sync``: synchronizes trees in all distributed nodes. - ``refresh``: refreshes tree's statistics and/or leaf values based on the current data. Note that no random subsampling of data rows is performed. - ``prune``: prunes the splits where loss < min_split_loss (or gamma) and nodes that have depth greater than ``max_depth``. diff --git a/doc/python/.gitignore b/doc/python/.gitignore index bb0916d77..f3097dfc2 100644 --- a/doc/python/.gitignore +++ b/doc/python/.gitignore @@ -1,3 +1,5 @@ examples dask-examples -survival-examples \ No newline at end of file +survival-examples +gpu-examples +rmm-examples \ No newline at end of file diff --git a/doc/python/index.rst b/doc/python/index.rst index fd34e0d43..079c91bfe 100644 --- a/doc/python/index.rst +++ b/doc/python/index.rst @@ -17,3 +17,5 @@ Contents examples/index dask-examples/index survival-examples/index + gpu-examples/index + rmm-examples/index diff --git a/doc/treemethod.rst b/doc/treemethod.rst index 8ecddc066..4dfb107a0 100644 --- a/doc/treemethod.rst +++ b/doc/treemethod.rst @@ -124,7 +124,7 @@ Following table summarizes some differences in supported features between 4 tree `T` means supported while `F` means unsupported. 
+------------------+-----------+---------------------+---------------------+------------------------+ -| | Exact | Approx | Hist | GPU Hist | +| | Exact | Approx | Hist | Hist (GPU) | +==================+===========+=====================+=====================+========================+ | grow_policy | Depthwise | depthwise/lossguide | depthwise/lossguide | depthwise/lossguide | +------------------+-----------+---------------------+---------------------+------------------------+ @@ -141,5 +141,5 @@ Following table summarizes some differences in supported features between 4 tree Features/parameters that are not mentioned here are universally supported for all 4 tree methods (for instance, column sampling and constraints). The `P` in external memory means -partially supported. Please note that both categorical data and external memory are +special handling. Please note that both categorical data and external memory are experimental. diff --git a/doc/tutorials/categorical.rst b/doc/tutorials/categorical.rst index bcea77ddf..2a84080cf 100644 --- a/doc/tutorials/categorical.rst +++ b/doc/tutorials/categorical.rst @@ -35,8 +35,8 @@ parameter ``enable_categorical``: .. code:: python - # Supported tree methods are `gpu_hist`, `approx`, and `hist`. - clf = xgb.XGBClassifier(tree_method="gpu_hist", enable_categorical=True) + # Supported tree methods are `approx` and `hist`. + clf = xgb.XGBClassifier(tree_method="hist", enable_categorical=True, device="cuda") # X is the dataframe we created in previous snippet clf.fit(X, y) # Must use JSON/UBJSON for serialization, otherwise the information is lost. diff --git a/doc/tutorials/external_memory.rst b/doc/tutorials/external_memory.rst index 832d13edd..811db6bd5 100644 --- a/doc/tutorials/external_memory.rst +++ b/doc/tutorials/external_memory.rst @@ -81,7 +81,7 @@ constructor. it = Iterator(["file_0.svm", "file_1.svm", "file_2.svm"]) Xy = xgboost.DMatrix(it) - # Other tree methods including ``hist`` and ``gpu_hist`` also work, but has some caveats + # The ``approx`` also work, but with low performance. GPU implementation is different from CPU. # as noted in following sections. booster = xgboost.train({"tree_method": "hist"}, Xy) @@ -118,15 +118,15 @@ to reduce the overhead of file reading. GPU Version (GPU Hist tree method) ********************************** -External memory is supported by GPU algorithms (i.e. when ``tree_method`` is set to -``gpu_hist``). However, the algorithm used for GPU is different from the one used for +External memory is supported by GPU algorithms (i.e. when ``device`` is set to +``cuda``). However, the algorithm used for GPU is different from the one used for CPU. When training on a CPU, the tree method iterates through all batches from external memory for each step of the tree construction algorithm. On the other hand, the GPU algorithm uses a hybrid approach. It iterates through the data during the beginning of -each iteration and concatenates all batches into one in GPU memory. To reduce overall -memory usage, users can utilize subsampling. The GPU hist tree method supports -`gradient-based sampling`, enabling users to set a low sampling rate without compromising -accuracy. +each iteration and concatenates all batches into one in GPU memory for performance +reasons. To reduce overall memory usage, users can utilize subsampling. The GPU hist tree +method supports `gradient-based sampling`, enabling users to set a low sampling rate +without compromising accuracy. .. 
code-block:: python diff --git a/doc/tutorials/monotonic.rst b/doc/tutorials/monotonic.rst index 4ed7fa273..e663d1109 100644 --- a/doc/tutorials/monotonic.rst +++ b/doc/tutorials/monotonic.rst @@ -83,13 +83,14 @@ Some other examples: - ``(0,-1)``: No constraint on the first predictor and a decreasing constraint on the second. -**Note for the 'hist' tree construction algorithm**. -If ``tree_method`` is set to either ``hist``, ``approx`` or ``gpu_hist``, enabling -monotonic constraints may produce unnecessarily shallow trees. This is because the -``hist`` method reduces the number of candidate splits to be considered at each -split. Monotonic constraints may wipe out all available split candidates, in which case no -split is made. To reduce the effect, you may want to increase the ``max_bin`` parameter to -consider more split candidates. +.. note:: + + **Note for the 'hist' tree construction algorithm**. If ``tree_method`` is set to + either ``hist`` or ``approx``, enabling monotonic constraints may produce unnecessarily + shallow trees. This is because the ``hist`` method reduces the number of candidate + splits to be considered at each split. Monotonic constraints may wipe out all available + split candidates, in which case no split is made. To reduce the effect, you may want to + increase the ``max_bin`` parameter to consider more split candidates. ******************* diff --git a/doc/tutorials/param_tuning.rst b/doc/tutorials/param_tuning.rst index 5ede195f3..5ef8df003 100644 --- a/doc/tutorials/param_tuning.rst +++ b/doc/tutorials/param_tuning.rst @@ -38,10 +38,6 @@ There are in general two ways that you can control overfitting in XGBoost: - This includes ``subsample`` and ``colsample_bytree``. - You can also reduce stepsize ``eta``. Remember to increase ``num_round`` when you do so. -*************************** -Faster training performance -*************************** -There's a parameter called ``tree_method``, set it to ``hist`` or ``gpu_hist`` for faster computation. ************************* Handle Imbalanced Dataset diff --git a/doc/tutorials/rf.rst b/doc/tutorials/rf.rst index b68204e63..014c67060 100644 --- a/doc/tutorials/rf.rst +++ b/doc/tutorials/rf.rst @@ -50,13 +50,14 @@ Here is a sample parameter dictionary for training a random forest on a GPU usin xgboost:: params = { - 'colsample_bynode': 0.8, - 'learning_rate': 1, - 'max_depth': 5, - 'num_parallel_tree': 100, - 'objective': 'binary:logistic', - 'subsample': 0.8, - 'tree_method': 'gpu_hist' + "colsample_bynode": 0.8, + "learning_rate": 1, + "max_depth": 5, + "num_parallel_tree": 100, + "objective": "binary:logistic", + "subsample": 0.8, + "tree_method": "hist", + "device": "cuda", } A random forest model can then be trained as follows:: diff --git a/doc/tutorials/saving_model.rst b/doc/tutorials/saving_model.rst index 5d9ba1d55..54c217249 100644 --- a/doc/tutorials/saving_model.rst +++ b/doc/tutorials/saving_model.rst @@ -174,7 +174,7 @@ Will print out something similar to (not actual output as it's too long for demo "gbtree_train_param": { "num_parallel_tree": "1", "process_type": "default", - "tree_method": "gpu_hist", + "tree_method": "hist", "updater": "grow_gpu_hist", "updater_seq": "grow_gpu_hist" }, diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index 46a3ffa4a..9e155c20a 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -278,9 +278,15 @@ __model_doc = f""" without bias. device : Optional[str] - Device ordinal. + + .. 
versionadded:: 2.0.0 + + Device ordinal, available options are `cpu`, `cuda`, and `gpu`. + validate_parameters : Optional[bool] + Give warnings for unknown parameter. + enable_categorical : bool .. versionadded:: 1.5.0 diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py index f11a0eda8..b73dfba6c 100644 --- a/python-package/xgboost/spark/estimator.py +++ b/python-package/xgboost/spark/estimator.py @@ -144,8 +144,13 @@ class SparkXGBRegressor(_SparkXGBEstimator): .. deprecated:: 2.0.0 Use `device` instead. + device: + + .. versionadded:: 2.0.0 + Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`. + force_repartition: Boolean value to specify if forcing the input dataset to be repartitioned before XGBoost training. @@ -319,8 +324,13 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction .. deprecated:: 2.0.0 Use `device` instead. + device: + + .. versionadded:: 2.0.0 + Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`. + force_repartition: Boolean value to specify if forcing the input dataset to be repartitioned before XGBoost training. @@ -497,8 +507,13 @@ class SparkXGBRanker(_SparkXGBEstimator): .. deprecated:: 2.0.0 Use `device` instead. + device: + + .. versionadded:: 2.0.0 + Device for XGBoost workers, available options are `cpu`, `cuda`, and `gpu`. + force_repartition: Boolean value to specify if forcing the input dataset to be repartitioned before XGBoost training. diff --git a/src/data/data.cc b/src/data/data.cc index d305749ee..7c76c6d25 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -724,11 +724,15 @@ void MetaInfo::SynchronizeNumberOfColumns() { namespace { template void CheckDevice(std::int32_t device, HostDeviceVector const& v) { - CHECK(v.DeviceIdx() == Context::kCpuId || device == Context::kCpuId || v.DeviceIdx() == device) - << "Data is resided on a different device than `gpu_id`. " - << "Device that data is on: " << v.DeviceIdx() << ", " - << "`gpu_id` for XGBoost: " << device; + bool valid = + v.DeviceIdx() == Context::kCpuId || device == Context::kCpuId || v.DeviceIdx() == device; + if (!valid) { + LOG(FATAL) << "Invalid device ordinal. Data is associated with a different device ordinal than " + "the booster. The device ordinal of the data is: " + << v.DeviceIdx() << "; the device ordinal of the Booster is: " << device; + } } + template void CheckDevice(std::int32_t device, linalg::Tensor const& v) { CheckDevice(device, *v.Data()); diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 0806c13a7..8b456af66 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -42,22 +42,22 @@ DMLC_REGISTRY_FILE_TAG(gbtree); namespace { /** @brief Map the `tree_method` parameter to the `updater` parameter. 
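 *
 * Dispatch summary: `hist` maps to `grow_quantile_histmaker` on CPU and to
 * `grow_gpu_hist` on CUDA, while `approx` and `exact` fail with an error
 * when the context is a CUDA device.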
*/ -std::string MapTreeMethodToUpdaters(Context const* ctx_, TreeMethod tree_method) { +std::string MapTreeMethodToUpdaters(Context const* ctx, TreeMethod tree_method) { // Choose updaters according to tree_method parameters + if (ctx->IsCUDA()) { + common::AssertGPUSupport(); + } switch (tree_method) { case TreeMethod::kAuto: // Use hist as default in 2.0 case TreeMethod::kHist: { - return ctx_->DispatchDevice([] { return "grow_quantile_histmaker"; }, - [] { - common::AssertGPUSupport(); - return "grow_gpu_hist"; - }); + return ctx->DispatchDevice([] { return "grow_quantile_histmaker"; }, + [] { return "grow_gpu_hist"; }); } case TreeMethod::kApprox: - CHECK(ctx_->IsCPU()) << "The `approx` tree method is not supported on GPU."; + CHECK(ctx->IsCPU()) << "The `approx` tree method is not supported on GPU."; return "grow_histmaker"; case TreeMethod::kExact: - CHECK(ctx_->IsCPU()) << "The `exact` tree method is not supported on GPU."; + CHECK(ctx->IsCPU()) << "The `exact` tree method is not supported on GPU."; return "grow_colmaker,prune"; case TreeMethod::kGPUHist: { common::AssertGPUSupport(); @@ -150,6 +150,7 @@ void GBTree::Configure(Args const& cfg) { CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto) << "Only the hist tree method is supported for building multi-target trees with vector " "leaf."; + CHECK(ctx_->IsCPU()) << "GPU is not yet supported for vector leaf."; } LOG(DEBUG) << "Using tree method: " << static_cast(tparam_.tree_method); diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py index ca5d56e4c..5a5745ffb 100644 --- a/tests/ci_build/lint_python.py +++ b/tests/ci_build/lint_python.py @@ -29,10 +29,12 @@ class LintersPaths: "tests/python-gpu/load_pickle.py", "tests/python-gpu/test_gpu_pickling.py", "tests/python-gpu/test_gpu_eval_metrics.py", + "tests/python-gpu/test_gpu_with_sklearn.py", "tests/test_distributed/test_with_spark/", "tests/test_distributed/test_gpu_with_spark/", # demo "demo/dask/", + "demo/rmm_plugin", "demo/json-model/json_parser.py", "demo/guide-python/cat_in_the_dat.py", "demo/guide-python/categorical.py", diff --git a/tests/python-gpu/test_from_cupy.py b/tests/python-gpu/test_from_cupy.py index 71667fa7b..b811ba090 100644 --- a/tests/python-gpu/test_from_cupy.py +++ b/tests/python-gpu/test_from_cupy.py @@ -234,7 +234,7 @@ Arrow specification.''' cp.cuda.runtime.setDevice(0) dtrain = dmatrix_from_cupy(np.float32, xgb.QuantileDMatrix, np.nan) with pytest.raises( - xgb.core.XGBoostError, match="Data is resided on a different device" + xgb.core.XGBoostError, match="Invalid device ordinal" ): xgb.train( {'tree_method': 'gpu_hist', 'gpu_id': 1}, dtrain, num_boost_round=10 diff --git a/tests/python-gpu/test_gpu_with_sklearn.py b/tests/python-gpu/test_gpu_with_sklearn.py index c9d3ab4eb..530d3e9df 100644 --- a/tests/python-gpu/test_gpu_with_sklearn.py +++ b/tests/python-gpu/test_gpu_with_sklearn.py @@ -2,6 +2,7 @@ import json import os import sys import tempfile +from concurrent.futures import ThreadPoolExecutor import numpy as np import pytest @@ -23,18 +24,19 @@ def test_gpu_binary_classification(): from sklearn.model_selection import KFold digits = load_digits(n_class=2) - y = digits['target'] - X = digits['data'] + y = digits["target"] + X = digits["data"] kf = KFold(n_splits=2, shuffle=True, random_state=rng) for cls in (xgb.XGBClassifier, xgb.XGBRFClassifier): for train_index, test_index in kf.split(X, y): xgb_model = cls( - random_state=42, tree_method='gpu_hist', - n_estimators=4, 
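                # `gpu_id` and `tree_method="gpu_hist"` are the pre-2.0
                # spellings of `device="cuda"` with `tree_method="hist"`.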
gpu_id='0').fit(X[train_index], y[train_index]) + random_state=42, tree_method="gpu_hist", n_estimators=4, gpu_id="0" + ).fit(X[train_index], y[train_index]) preds = xgb_model.predict(X[test_index]) labels = y[test_index] - err = sum(1 for i in range(len(preds)) - if int(preds[i] > 0.5) != labels[i]) / float(len(preds)) + err = sum( + 1 for i in range(len(preds)) if int(preds[i] > 0.5) != labels[i] + ) / float(len(preds)) assert err < 0.1 @@ -133,7 +135,7 @@ def test_classififer(): X, y = load_digits(return_X_y=True) y *= 10 - clf = xgb.XGBClassifier(tree_method="gpu_hist", n_estimators=1) + clf = xgb.XGBClassifier(tree_method="hist", n_estimators=1, device="cuda") # numpy with pytest.raises(ValueError, match=r"Invalid classes.*"): @@ -161,3 +163,46 @@ def test_ranking_qid_df(): import cudf run_ranking_qid_df(cudf, "gpu_hist") + + +@pytest.mark.skipif(**tm.no_cupy()) +@pytest.mark.mgpu +def test_device_ordinal() -> None: + import cupy as cp + + n_devices = 2 + + def worker(ordinal: int, correct_ordinal: bool) -> None: + if correct_ordinal: + cp.cuda.runtime.setDevice(ordinal) + else: + cp.cuda.runtime.setDevice((ordinal + 1) % n_devices) + + X, y, w = tm.make_regression(4096, 12, use_cupy=True) + reg = xgb.XGBRegressor(device=f"cuda:{ordinal}", tree_method="hist") + + if correct_ordinal: + reg.fit( + X, y, sample_weight=w, eval_set=[(X, y)], sample_weight_eval_set=[w] + ) + assert tm.non_increasing(reg.evals_result()["validation_0"]["rmse"]) + return + + with pytest.raises(ValueError, match="Invalid device ordinal"): + reg.fit( + X, y, sample_weight=w, eval_set=[(X, y)], sample_weight_eval_set=[w] + ) + + with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor: + futures = [] + n_trials = 32 + for i in range(n_trials): + fut = executor.submit( + worker, ordinal=i % n_devices, correct_ordinal=i % 3 != 0 + ) + futures.append(fut) + + for fut in futures: + fut.result() + + cp.cuda.runtime.setDevice(0) From 01e00efc53003290051ad0b3db0c85a6ba74666b Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 24 Jul 2023 11:06:30 +0800 Subject: [PATCH 048/136] [breaking] Remove support for single string feature info. (#9401) - Input must be a sequence of strings. - Improve validation error message. --- python-package/xgboost/core.py | 189 +++++++++++++++------------------ tests/python/test_dmatrix.py | 5 +- 2 files changed, 90 insertions(+), 104 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 4cacd61f3..70ef3535d 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -297,6 +297,23 @@ def _check_distributed_params(kwargs: Dict[str, Any]) -> None: ) +def _validate_feature_info( + feature_info: Sequence[str], n_features: int, name: str +) -> List[str]: + if isinstance(feature_info, str) or not isinstance(feature_info, Sequence): + raise TypeError( + f"Expecting a sequence of strings for {name}, got: {type(feature_info)}" + ) + feature_info = list(feature_info) + if len(feature_info) != n_features and n_features != 0: + msg = ( + f"{name} must have the same length as the number of data columns, ", + f"expected {n_features}, got {len(feature_info)}", + ) + raise ValueError(msg) + return feature_info + + def build_info() -> dict: """Build information of XGBoost. The returned value format is not stable. Also, please note that build time dependency is not the same as runtime dependency. 
For @@ -1217,11 +1234,10 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m @property def feature_names(self) -> Optional[FeatureNames]: - """Get feature names (column labels). + """Labels for features (column labels). + + Setting it to ``None`` resets existing feature names. - Returns - ------- - feature_names : list or None """ length = c_bst_ulong() sarr = ctypes.POINTER(ctypes.c_char_p)() @@ -1240,67 +1256,61 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m @feature_names.setter def feature_names(self, feature_names: Optional[FeatureNames]) -> None: - """Set feature names (column labels). - - Parameters - ---------- - feature_names : list or None - Labels for features. None will reset existing feature names - """ - if feature_names is not None: - # validate feature name - try: - if not isinstance(feature_names, str): - feature_names = list(feature_names) - else: - feature_names = [feature_names] - except TypeError: - feature_names = [cast(str, feature_names)] - - if len(feature_names) != len(set(feature_names)): - raise ValueError("feature_names must be unique") - if len(feature_names) != self.num_col() and self.num_col() != 0: - msg = ( - "feature_names must have the same length as data, ", - f"expected {self.num_col()}, got {len(feature_names)}", - ) - raise ValueError(msg) - # prohibit to use symbols may affect to parse. e.g. []< - if not all( - isinstance(f, str) and not any(x in f for x in ["[", "]", "<"]) - for f in feature_names - ): - raise ValueError( - "feature_names must be string, and may not contain [, ] or <" - ) - feature_names_bytes = [bytes(f, encoding="utf-8") for f in feature_names] - c_feature_names = (ctypes.c_char_p * len(feature_names_bytes))( - *feature_names_bytes - ) - _check_call( - _LIB.XGDMatrixSetStrFeatureInfo( - self.handle, - c_str("feature_name"), - c_feature_names, - c_bst_ulong(len(feature_names)), - ) - ) - else: - # reset feature_types also + if feature_names is None: _check_call( _LIB.XGDMatrixSetStrFeatureInfo( self.handle, c_str("feature_name"), None, c_bst_ulong(0) ) ) - self.feature_types = None + return + + # validate feature name + feature_names = _validate_feature_info( + feature_names, self.num_col(), "feature names" + ) + if len(feature_names) != len(set(feature_names)): + values, counts = np.unique( + feature_names, + return_index=False, + return_inverse=False, + return_counts=True, + ) + duplicates = [name for name, cnt in zip(values, counts) if cnt > 1] + raise ValueError( + f"feature_names must be unique. Duplicates found: {duplicates}" + ) + + # prohibit the use symbols that may affect parsing. e.g. []< + if not all( + isinstance(f, str) and not any(x in f for x in ["[", "]", "<"]) + for f in feature_names + ): + raise ValueError( + "feature_names must be string, and may not contain [, ] or <" + ) + + feature_names_bytes = [bytes(f, encoding="utf-8") for f in feature_names] + c_feature_names = (ctypes.c_char_p * len(feature_names_bytes))( + *feature_names_bytes + ) + _check_call( + _LIB.XGDMatrixSetStrFeatureInfo( + self.handle, + c_str("feature_name"), + c_feature_names, + c_bst_ulong(len(feature_names)), + ) + ) @property def feature_types(self) -> Optional[FeatureTypes]: - """Get feature types (column types). + """Type of features (column types). + + This is for displaying the results and categorical data support. See + :py:class:`DMatrix` for details. + + Setting it to ``None`` resets existing feature types. 
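Taken together, the validation above behaves like this hypothetical session (the array contents are placeholders):

.. code-block:: python

    import numpy as np
    import xgboost as xgb

    dm = xgb.DMatrix(np.random.rand(4, 3))
    dm.feature_names = ["a", "b", "c"]   # a sequence of strings is required
    dm.feature_types = ["q", "q", "c"]   # must match the number of columns

    # A single string is no longer broadcast to every column; it now raises.
    try:
        dm.feature_types = "q"
    except TypeError:
        pass

    # Duplicated names are reported explicitly.
    try:
        dm.feature_names = ["a", "a", "b"]
    except ValueError as e:
        assert "Duplicates found" in str(e)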
- Returns - ------- - feature_types : list or None """ length = c_bst_ulong() sarr = ctypes.POINTER(ctypes.c_char_p)() @@ -1318,57 +1328,32 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m return res @feature_types.setter - def feature_types(self, feature_types: Optional[Union[List[str], str]]) -> None: - """Set feature types (column types). - - This is for displaying the results and categorical data support. See - :py:class:`DMatrix` for details. - - Parameters - ---------- - feature_types : - Labels for features. None will reset existing feature names - - """ - # For compatibility reason this function wraps single str input into a list. But - # we should not promote such usage since other than visualization, the field is - # also used for specifying categorical data type. - if feature_types is not None: - if not isinstance(feature_types, (list, str)): - raise TypeError("feature_types must be string or list of strings") - if isinstance(feature_types, str): - # single string will be applied to all columns - feature_types = [feature_types] * self.num_col() - try: - if not isinstance(feature_types, str): - feature_types = list(feature_types) - else: - feature_types = [feature_types] - except TypeError: - feature_types = [cast(str, feature_types)] - feature_types_bytes = [bytes(f, encoding="utf-8") for f in feature_types] - c_feature_types = (ctypes.c_char_p * len(feature_types_bytes))( - *feature_types_bytes - ) - _check_call( - _LIB.XGDMatrixSetStrFeatureInfo( - self.handle, - c_str("feature_type"), - c_feature_types, - c_bst_ulong(len(feature_types)), - ) - ) - - if len(feature_types) != self.num_col() and self.num_col() != 0: - msg = "feature_types must have the same length as data" - raise ValueError(msg) - else: - # Reset. + def feature_types(self, feature_types: Optional[FeatureTypes]) -> None: + if feature_types is None: + # Reset _check_call( _LIB.XGDMatrixSetStrFeatureInfo( self.handle, c_str("feature_type"), None, c_bst_ulong(0) ) ) + return + + feature_types = _validate_feature_info( + feature_types, self.num_col(), "feature types" + ) + + feature_types_bytes = [bytes(f, encoding="utf-8") for f in feature_types] + c_feature_types = (ctypes.c_char_p * len(feature_types_bytes))( + *feature_types_bytes + ) + _check_call( + _LIB.XGDMatrixSetStrFeatureInfo( + self.handle, + c_str("feature_type"), + c_feature_types, + c_bst_ulong(len(feature_types)), + ) + ) class _ProxyDMatrix(DMatrix): diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py index bcc089afb..73e2055b7 100644 --- a/tests/python/test_dmatrix.py +++ b/tests/python/test_dmatrix.py @@ -219,8 +219,8 @@ class TestDMatrix: assert dm.slice([0, 1]).num_col() == dm.num_col() assert dm.slice([0, 1]).feature_names == dm.feature_names - dm.feature_types = 'q' - assert dm.feature_types == list('qqqqq') + with pytest.raises(ValueError, match=r"Duplicates found: \['bar'\]"): + dm.feature_names = ["bar"] * (data.shape[1] - 2) + ["a", "b"] dm.feature_types = list('qiqiq') assert dm.feature_types == list('qiqiq') @@ -230,6 +230,7 @@ class TestDMatrix: # reset dm.feature_names = None + dm.feature_types = None assert dm.feature_names is None assert dm.feature_types is None From 851cba931ebafd5ee38e57fb980dcaf465dd87fc Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 24 Jul 2023 12:43:35 +0800 Subject: [PATCH 049/136] Define `best_iteration` only if early stopping is used. (#9403) * Define `best_iteration` only if early stopping is used. 
This is the behavior specified by the document but not honored in the actual code. - Don't set the attributes if there's no early stopping. - Clean up the code for callbacks, and replace assertions with proper exceptions. - Assign the attributes when early stopping `save_best` is used. - Turn the attributes into Python properties. --------- Co-authored-by: Philip Hyunsu Cho --- demo/guide-python/callbacks.py | 75 ++++++------ doc/python/model.rst | 4 + python-package/xgboost/callback.py | 112 ++++++++++-------- python-package/xgboost/core.py | 35 +++++- python-package/xgboost/sklearn.py | 33 ++---- python-package/xgboost/training.py | 24 ++-- tests/ci_build/lint_python.py | 1 + tests/python/test_callback.py | 127 +++++++++++++-------- tests/python/test_predict.py | 4 +- tests/python/test_training_continuation.py | 13 ++- 10 files changed, 249 insertions(+), 179 deletions(-) diff --git a/demo/guide-python/callbacks.py b/demo/guide-python/callbacks.py index 42fe397db..be03b1693 100644 --- a/demo/guide-python/callbacks.py +++ b/demo/guide-python/callbacks.py @@ -1,9 +1,9 @@ -''' +""" Demo for using and defining callback functions ============================================== .. versionadded:: 1.3.0 -''' +""" import argparse import os import tempfile @@ -17,10 +17,11 @@ import xgboost as xgb class Plotting(xgb.callback.TrainingCallback): - '''Plot evaluation result during training. Only for demonstration purpose as it's quite + """Plot evaluation result during training. Only for demonstration purpose as it's quite slow to draw. - ''' + """ + def __init__(self, rounds): self.fig = plt.figure() self.ax = self.fig.add_subplot(111) @@ -31,16 +32,16 @@ class Plotting(xgb.callback.TrainingCallback): plt.ion() def _get_key(self, data, metric): - return f'{data}-{metric}' + return f"{data}-{metric}" def after_iteration(self, model, epoch, evals_log): - '''Update the plot.''' + """Update the plot.""" if not self.lines: for data, metric in evals_log.items(): for metric_name, log in metric.items(): key = self._get_key(data, metric_name) expanded = log + [0] * (self.rounds - len(log)) - self.lines[key], = self.ax.plot(self.x, expanded, label=key) + (self.lines[key],) = self.ax.plot(self.x, expanded, label=key) self.ax.legend() else: # https://pythonspot.com/matplotlib-update-plot/ @@ -55,8 +56,8 @@ class Plotting(xgb.callback.TrainingCallback): def custom_callback(): - '''Demo for defining a custom callback function that plots evaluation result during - training.''' + """Demo for defining a custom callback function that plots evaluation result during + training.""" X, y = load_breast_cancer(return_X_y=True) X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0) @@ -69,15 +70,16 @@ def custom_callback(): # Pass it to the `callbacks` parameter as a list. 
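    # Note: `after_iteration` can return True to request that training stop
    # early; returning False lets training continue to `num_boost_round`.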
xgb.train( { - 'objective': 'binary:logistic', - 'eval_metric': ['error', 'rmse'], - 'tree_method': 'hist', + "objective": "binary:logistic", + "eval_metric": ["error", "rmse"], + "tree_method": "hist", "device": "cuda", }, D_train, - evals=[(D_train, 'Train'), (D_valid, 'Valid')], + evals=[(D_train, "Train"), (D_valid, "Valid")], num_boost_round=num_boost_round, - callbacks=[plotting]) + callbacks=[plotting], + ) def check_point_callback(): @@ -90,10 +92,10 @@ def check_point_callback(): if i == 0: continue if as_pickle: - path = os.path.join(tmpdir, 'model_' + str(i) + '.pkl') + path = os.path.join(tmpdir, "model_" + str(i) + ".pkl") else: - path = os.path.join(tmpdir, 'model_' + str(i) + '.json') - assert(os.path.exists(path)) + path = os.path.join(tmpdir, "model_" + str(i) + ".json") + assert os.path.exists(path) X, y = load_breast_cancer(return_X_y=True) m = xgb.DMatrix(X, y) @@ -101,31 +103,36 @@ def check_point_callback(): with tempfile.TemporaryDirectory() as tmpdir: # Use callback class from xgboost.callback # Feel free to subclass/customize it to suit your need. - check_point = xgb.callback.TrainingCheckPoint(directory=tmpdir, - iterations=rounds, - name='model') - xgb.train({'objective': 'binary:logistic'}, m, - num_boost_round=10, - verbose_eval=False, - callbacks=[check_point]) + check_point = xgb.callback.TrainingCheckPoint( + directory=tmpdir, iterations=rounds, name="model" + ) + xgb.train( + {"objective": "binary:logistic"}, + m, + num_boost_round=10, + verbose_eval=False, + callbacks=[check_point], + ) check(False) # This version of checkpoint saves everything including parameters and # model. See: doc/tutorials/saving_model.rst - check_point = xgb.callback.TrainingCheckPoint(directory=tmpdir, - iterations=rounds, - as_pickle=True, - name='model') - xgb.train({'objective': 'binary:logistic'}, m, - num_boost_round=10, - verbose_eval=False, - callbacks=[check_point]) + check_point = xgb.callback.TrainingCheckPoint( + directory=tmpdir, iterations=rounds, as_pickle=True, name="model" + ) + xgb.train( + {"objective": "binary:logistic"}, + m, + num_boost_round=10, + verbose_eval=False, + callbacks=[check_point], + ) check(True) -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument('--plot', default=1, type=int) + parser.add_argument("--plot", default=1, type=int) args = parser.parse_args() check_point_callback() diff --git a/doc/python/model.rst b/doc/python/model.rst index c854043b3..5ea38164a 100644 --- a/doc/python/model.rst +++ b/doc/python/model.rst @@ -37,3 +37,7 @@ The sliced model is a copy of selected trees, that means the model itself is imm during slicing. This feature is the basis of `save_best` option in early stopping callback. See :ref:`sphx_glr_python_examples_individual_trees.py` for a worked example on how to combine prediction with sliced trees. + +.. note:: + + The returned model slice doesn't contain attributes like :py:class:`~xgboost.Booster.best_iteration` and :py:class:`~xgboost.Booster.best_score`. diff --git a/python-package/xgboost/callback.py b/python-package/xgboost/callback.py index 88e340737..6077aa1e3 100644 --- a/python-package/xgboost/callback.py +++ b/python-package/xgboost/callback.py @@ -134,13 +134,17 @@ class CallbackContainer: is_cv: bool = False, ) -> None: self.callbacks = set(callbacks) - if metric is not None: - msg = ( - "metric must be callable object for monitoring. For " - + "builtin metrics, passing them in training parameter" - + " will invoke monitor automatically." 
- ) - assert callable(metric), msg + for cb in callbacks: + if not isinstance(cb, TrainingCallback): + raise TypeError("callback must be an instance of `TrainingCallback`.") + + msg = ( + "metric must be callable object for monitoring. For builtin metrics" + ", passing them in training parameter invokes monitor automatically." + ) + if metric is not None and not callable(metric): + raise TypeError(msg) + self.metric = metric self.history: TrainingCallback.EvalsLog = collections.OrderedDict() self._output_margin = output_margin @@ -170,16 +174,6 @@ class CallbackContainer: else: assert isinstance(model, Booster), msg - if not self.is_cv: - if model.attr("best_score") is not None: - model.best_score = float(cast(str, model.attr("best_score"))) - model.best_iteration = int(cast(str, model.attr("best_iteration"))) - else: - # Due to compatibility with version older than 1.4, these attributes are - # added to Python object even if early stopping is not used. - model.best_iteration = model.num_boosted_rounds() - 1 - model.set_attr(best_iteration=str(model.best_iteration)) - return model def before_iteration( @@ -267,9 +261,14 @@ class LearningRateScheduler(TrainingCallback): def __init__( self, learning_rates: Union[Callable[[int], float], Sequence[float]] ) -> None: - assert callable(learning_rates) or isinstance( + if not callable(learning_rates) and not isinstance( learning_rates, collections.abc.Sequence - ) + ): + raise TypeError( + "Invalid learning rates, expecting callable or sequence, got: " + f"{type(learning_rates)}" + ) + if callable(learning_rates): self.learning_rates = learning_rates else: @@ -302,24 +301,28 @@ class EarlyStopping(TrainingCallback): save_best : Whether training should return the best model or the last model. min_delta : - Minimum absolute change in score to be qualified as an improvement. .. versionadded:: 1.5.0 - .. code-block:: python + Minimum absolute change in score to be qualified as an improvement. - es = xgboost.callback.EarlyStopping( - rounds=2, - min_delta=1e-3, - save_best=True, - maximize=False, - data_name="validation_0", - metric_name="mlogloss", - ) - clf = xgboost.XGBClassifier(tree_method="gpu_hist", callbacks=[es]) + Examples + -------- - X, y = load_digits(return_X_y=True) - clf.fit(X, y, eval_set=[(X, y)]) + .. code-block:: python + + es = xgboost.callback.EarlyStopping( + rounds=2, + min_delta=1e-3, + save_best=True, + maximize=False, + data_name="validation_0", + metric_name="mlogloss", + ) + clf = xgboost.XGBClassifier(tree_method="hist", device="cuda", callbacks=[es]) + + X, y = load_digits(return_X_y=True) + clf.fit(X, y, eval_set=[(X, y)]) """ # pylint: disable=too-many-arguments @@ -363,7 +366,7 @@ class EarlyStopping(TrainingCallback): return numpy.greater(get_s(new) - self._min_delta, get_s(best)) def minimize(new: _Score, best: _Score) -> bool: - """New score should be smaller than the old one.""" + """New score should be lesser than the old one.""" return numpy.greater(get_s(best) - self._min_delta, get_s(new)) if self.maximize is None: @@ -419,38 +422,53 @@ class EarlyStopping(TrainingCallback): ) -> bool: epoch += self.starting_round # training continuation msg = "Must have at least 1 validation dataset for early stopping." 
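        # Resolve which dataset and metric drive early stopping: use the
        # explicitly configured names when present, otherwise fall back to
        # the last entries recorded in `evals_log`.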
- assert len(evals_log.keys()) >= 1, msg - data_name = "" + if len(evals_log.keys()) < 1: + raise ValueError(msg) + + # Get data name if self.data: - for d, _ in evals_log.items(): - if d == self.data: - data_name = d - if not data_name: - raise ValueError("No dataset named:", self.data) + data_name = self.data else: # Use the last one as default. data_name = list(evals_log.keys())[-1] - assert isinstance(data_name, str) and data_name + if data_name not in evals_log: + raise ValueError(f"No dataset named: {data_name}") + + if not isinstance(data_name, str): + raise TypeError( + f"The name of the dataset should be a string. Got: {type(data_name)}" + ) data_log = evals_log[data_name] - # Filter out scores that can not be used for early stopping. + # Get metric name if self.metric_name: metric_name = self.metric_name else: # Use last metric by default. - assert isinstance(data_log, collections.OrderedDict) metric_name = list(data_log.keys())[-1] + if metric_name not in data_log: + raise ValueError(f"No metric named: {metric_name}") + + # The latest score score = data_log[metric_name][-1] return self._update_rounds(score, data_name, metric_name, model, epoch) def after_training(self, model: _Model) -> _Model: + if not self.save_best: + return model + try: - if self.save_best: - model = model[: int(model.attr("best_iteration")) + 1] + best_iteration = model.best_iteration + best_score = model.best_score + assert best_iteration is not None and best_score is not None + model = model[: best_iteration + 1] + model.best_iteration = best_iteration + model.best_score = best_score except XGBoostError as e: raise XGBoostError( - "`save_best` is not applicable to current booster" + "`save_best` is not applicable to the current booster" ) from e + return model @@ -462,8 +480,6 @@ class EvaluationMonitor(TrainingCallback): Parameters ---------- - metric : - Extra user defined metric. rank : Which worker should be used for printing the result. period : diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 70ef3535d..5658a5079 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -1890,7 +1890,7 @@ class Booster: attr_names = from_cstr_to_pystr(sarr, length) return {n: self.attr(n) for n in attr_names} - def set_attr(self, **kwargs: Optional[str]) -> None: + def set_attr(self, **kwargs: Optional[Any]) -> None: """Set the attribute of the Booster. Parameters @@ -2559,10 +2559,35 @@ class Booster: else: raise TypeError("Unknown file type: ", fname) - if self.attr("best_iteration") is not None: - self.best_iteration = int(cast(int, self.attr("best_iteration"))) - if self.attr("best_score") is not None: - self.best_score = float(cast(float, self.attr("best_score"))) + @property + def best_iteration(self) -> int: + """The best iteration during training.""" + best = self.attr("best_iteration") + if best is not None: + return int(best) + + raise AttributeError( + "`best_iteration` is only defined when early stopping is used." + ) + + @best_iteration.setter + def best_iteration(self, iteration: int) -> None: + self.set_attr(best_iteration=iteration) + + @property + def best_score(self) -> float: + """The best evaluation score during training.""" + best = self.attr("best_score") + if best is not None: + return float(best) + + raise AttributeError( + "`best_score` is only defined when early stopping is used." 
+ ) + + @best_score.setter + def best_score(self, score: int) -> None: + self.set_attr(best_score=score) def num_boosted_rounds(self) -> int: """Get number of boosted rounds. For gblinear this is reset to 0 after diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index 9e155c20a..e791be51c 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -230,10 +230,10 @@ __model_doc = f""" subsample : Optional[float] Subsample ratio of the training instance. sampling_method : - Sampling method. Used only by `gpu_hist` tree method. - - `uniform`: select random training instances uniformly. - - `gradient_based` select random training instances with higher probability when - the gradient and hessian are larger. (cf. CatBoost) + Sampling method. Used only by the GPU version of ``hist`` tree method. + - ``uniform``: select random training instances uniformly. + - ``gradient_based`` select random training instances with higher probability + when the gradient and hessian are larger. (cf. CatBoost) colsample_bytree : Optional[float] Subsample ratio of columns when constructing each tree. colsample_bylevel : Optional[float] @@ -992,12 +992,12 @@ class XGBModel(XGBModelBase): X : Feature matrix. See :ref:`py-data` for a list of supported types. - When the ``tree_method`` is set to ``hist`` or ``gpu_hist``, internally, the + When the ``tree_method`` is set to ``hist``, internally, the :py:class:`QuantileDMatrix` will be used instead of the :py:class:`DMatrix` for conserving memory. However, this has performance implications when the device of input data is not matched with algorithm. For instance, if the - input is a numpy array on CPU but ``gpu_hist`` is used for training, then - the data is first processed on CPU then transferred to GPU. + input is a numpy array on CPU but ``cuda`` is used for training, then the + data is first processed on CPU then transferred to GPU. y : Labels sample_weight : @@ -1279,19 +1279,10 @@ class XGBModel(XGBModelBase): ) return np.array(feature_names) - def _early_stopping_attr(self, attr: str) -> Union[float, int]: - booster = self.get_booster() - try: - return getattr(booster, attr) - except AttributeError as e: - raise AttributeError( - f"`{attr}` in only defined when early stopping is used." - ) from e - @property def best_score(self) -> float: """The best score obtained by early stopping.""" - return float(self._early_stopping_attr("best_score")) + return self.get_booster().best_score @property def best_iteration(self) -> int: @@ -1299,7 +1290,7 @@ class XGBModel(XGBModelBase): for instance if the best iteration is the first round, then best_iteration is 0. """ - return int(self._early_stopping_attr("best_iteration")) + return self.get_booster().best_iteration @property def feature_importances_(self) -> np.ndarray: @@ -1926,12 +1917,12 @@ class XGBRanker(XGBModel, XGBRankerMixIn): | 1 | :math:`x_{20}` | :math:`x_{21}` | +-----+----------------+----------------+ - When the ``tree_method`` is set to ``hist`` or ``gpu_hist``, internally, the + When the ``tree_method`` is set to ``hist``, internally, the :py:class:`QuantileDMatrix` will be used instead of the :py:class:`DMatrix` for conserving memory. However, this has performance implications when the device of input data is not matched with algorithm. For instance, if the - input is a numpy array on CPU but ``gpu_hist`` is used for training, then - the data is first processed on CPU then transferred to GPU. 
+ input is a numpy array on CPU but ``cuda`` is used for training, then the + data is first processed on CPU then transferred to GPU. y : Labels group : diff --git a/python-package/xgboost/training.py b/python-package/xgboost/training.py index a238e73c8..aa3c18a01 100644 --- a/python-package/xgboost/training.py +++ b/python-package/xgboost/training.py @@ -28,17 +28,6 @@ from .core import ( _CVFolds = Sequence["CVPack"] -def _assert_new_callback(callbacks: Optional[Sequence[TrainingCallback]]) -> None: - is_new_callback: bool = not callbacks or all( - isinstance(c, TrainingCallback) for c in callbacks - ) - if not is_new_callback: - link = "https://xgboost.readthedocs.io/en/latest/python/callbacks.html" - raise ValueError( - f"Old style callback was removed in version 1.6. See: {link}." - ) - - def _configure_custom_metric( feval: Optional[Metric], custom_metric: Optional[Metric] ) -> Optional[Metric]: @@ -170,7 +159,6 @@ def train( bst = Booster(params, [dtrain] + [d[0] for d in evals], model_file=xgb_model) start_iteration = 0 - _assert_new_callback(callbacks) if verbose_eval: verbose_eval = 1 if verbose_eval is True else verbose_eval callbacks.append(EvaluationMonitor(period=verbose_eval)) @@ -247,7 +235,7 @@ class _PackedBooster: result = [f.eval(iteration, feval, output_margin) for f in self.cvfolds] return result - def set_attr(self, **kwargs: Optional[str]) -> Any: + def set_attr(self, **kwargs: Optional[Any]) -> Any: """Iterate through folds for setting attributes""" for f in self.cvfolds: f.bst.set_attr(**kwargs) @@ -274,11 +262,20 @@ class _PackedBooster: """Get best_iteration""" return int(cast(int, self.cvfolds[0].bst.attr("best_iteration"))) + @best_iteration.setter + def best_iteration(self, iteration: int) -> None: + """Get best_iteration""" + self.set_attr(best_iteration=iteration) + @property def best_score(self) -> float: """Get best_score.""" return float(cast(float, self.cvfolds[0].bst.attr("best_score"))) + @best_score.setter + def best_score(self, score: float) -> None: + self.set_attr(best_score=score) + def groups_to_rows(groups: List[np.ndarray], boundaries: np.ndarray) -> np.ndarray: """ @@ -551,7 +548,6 @@ def cv( # setup callbacks callbacks = [] if callbacks is None else copy.copy(list(callbacks)) - _assert_new_callback(callbacks) if verbose_eval: verbose_eval = 1 if verbose_eval is True else verbose_eval diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py index 5a5745ffb..9749a8485 100644 --- a/tests/ci_build/lint_python.py +++ b/tests/ci_build/lint_python.py @@ -37,6 +37,7 @@ class LintersPaths: "demo/rmm_plugin", "demo/json-model/json_parser.py", "demo/guide-python/cat_in_the_dat.py", + "demo/guide-python/callbacks.py", "demo/guide-python/categorical.py", "demo/guide-python/feature_weights.py", "demo/guide-python/sklearn_parallel.py", diff --git a/tests/python/test_callback.py b/tests/python/test_callback.py index d3ec05e6e..56c9fdabd 100644 --- a/tests/python/test_callback.py +++ b/tests/python/test_callback.py @@ -1,7 +1,6 @@ import json import os import tempfile -from contextlib import nullcontext from typing import Union import pytest @@ -104,15 +103,6 @@ class TestCallbacks: dump = booster.get_dump(dump_format='json') assert len(dump) - booster.best_iteration == early_stopping_rounds + 1 - # No early stopping, best_iteration should be set to last epoch - booster = xgb.train({'objective': 'binary:logistic', - 'eval_metric': 'error'}, D_train, - evals=[(D_train, 'Train'), (D_valid, 'Valid')], - num_boost_round=10, - 
evals_result=evals_result, - verbose_eval=True) - assert booster.num_boosted_rounds() - 1 == booster.best_iteration - def test_early_stopping_custom_eval(self): D_train = xgb.DMatrix(self.X_train, self.y_train) D_valid = xgb.DMatrix(self.X_valid, self.y_valid) @@ -204,8 +194,9 @@ class TestCallbacks: X, y = load_breast_cancer(return_X_y=True) n_estimators = 100 early_stopping_rounds = 5 - early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds, - save_best=True) + early_stop = xgb.callback.EarlyStopping( + rounds=early_stopping_rounds, save_best=True + ) cls = xgb.XGBClassifier( n_estimators=n_estimators, eval_metric=tm.eval_error_metric_skl, @@ -216,20 +207,27 @@ class TestCallbacks: dump = booster.get_dump(dump_format='json') assert len(dump) == booster.best_iteration + 1 - early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds, - save_best=True) + early_stop = xgb.callback.EarlyStopping( + rounds=early_stopping_rounds, save_best=True + ) cls = xgb.XGBClassifier( - booster='gblinear', n_estimators=10, eval_metric=tm.eval_error_metric_skl + booster="gblinear", + n_estimators=10, + eval_metric=tm.eval_error_metric_skl, + callbacks=[early_stop], ) with pytest.raises(ValueError): - cls.fit(X, y, eval_set=[(X, y)], callbacks=[early_stop]) + cls.fit(X, y, eval_set=[(X, y)]) # No error early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds, save_best=False) xgb.XGBClassifier( - booster='gblinear', n_estimators=10, eval_metric=tm.eval_error_metric_skl - ).fit(X, y, eval_set=[(X, y)], callbacks=[early_stop]) + booster="gblinear", + n_estimators=10, + eval_metric=tm.eval_error_metric_skl, + callbacks=[early_stop], + ).fit(X, y, eval_set=[(X, y)]) def test_early_stopping_continuation(self): from sklearn.datasets import load_breast_cancer @@ -252,8 +250,11 @@ class TestCallbacks: cls.load_model(path) assert cls._Booster is not None early_stopping_rounds = 3 - cls.set_params(eval_metric=tm.eval_error_metric_skl) - cls.fit(X, y, eval_set=[(X, y)], early_stopping_rounds=early_stopping_rounds) + cls.set_params( + eval_metric=tm.eval_error_metric_skl, + early_stopping_rounds=early_stopping_rounds, + ) + cls.fit(X, y, eval_set=[(X, y)]) booster = cls.get_booster() assert booster.num_boosted_rounds() == \ booster.best_iteration + early_stopping_rounds + 1 @@ -280,20 +281,20 @@ class TestCallbacks: watchlist = [(dtest, 'eval'), (dtrain, 'train')] num_round = 4 - warning_check = nullcontext() - # learning_rates as a list # init eta with 0 to check whether learning_rates work param = {'max_depth': 2, 'eta': 0, 'verbosity': 0, 'objective': 'binary:logistic', 'eval_metric': 'error', 'tree_method': tree_method} evals_result = {} - with warning_check: - bst = xgb.train(param, dtrain, num_round, watchlist, - callbacks=[scheduler([ - 0.8, 0.7, 0.6, 0.5 - ])], - evals_result=evals_result) + bst = xgb.train( + param, + dtrain, + num_round, + evals=watchlist, + callbacks=[scheduler([0.8, 0.7, 0.6, 0.5])], + evals_result=evals_result, + ) eval_errors_0 = list(map(float, evals_result['eval']['error'])) assert isinstance(bst, xgb.core.Booster) # validation error should decrease, if eta > 0 @@ -304,11 +305,15 @@ class TestCallbacks: 'objective': 'binary:logistic', 'eval_metric': 'error', 'tree_method': tree_method} evals_result = {} - with warning_check: - bst = xgb.train(param, dtrain, num_round, watchlist, - callbacks=[scheduler( - [0.8, 0.7, 0.6, 0.5])], - evals_result=evals_result) + + bst = xgb.train( + param, + dtrain, + num_round, + evals=watchlist, + 
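            # The scheduler accepts either a sequence of rates (one per
            # round, as here) or a callable mapping the round index to a
            # rate; anything else raises a TypeError.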
callbacks=[scheduler([0.8, 0.7, 0.6, 0.5])], + evals_result=evals_result, + ) eval_errors_1 = list(map(float, evals_result['eval']['error'])) assert isinstance(bst, xgb.core.Booster) # validation error should decrease, if learning_rate > 0 @@ -320,12 +325,14 @@ class TestCallbacks: 'eval_metric': 'error', 'tree_method': tree_method } evals_result = {} - with warning_check: - bst = xgb.train(param, dtrain, num_round, watchlist, - callbacks=[scheduler( - [0, 0, 0, 0] - )], - evals_result=evals_result) + bst = xgb.train( + param, + dtrain, + num_round, + evals=watchlist, + callbacks=[scheduler([0, 0, 0, 0])], + evals_result=evals_result, + ) eval_errors_2 = list(map(float, evals_result['eval']['error'])) assert isinstance(bst, xgb.core.Booster) # validation error should not decrease, if eta/learning_rate = 0 @@ -336,12 +343,14 @@ class TestCallbacks: return num_boost_round / (ithround + 1) evals_result = {} - with warning_check: - bst = xgb.train(param, dtrain, num_round, watchlist, - callbacks=[ - scheduler(eta_decay) - ], - evals_result=evals_result) + bst = xgb.train( + param, + dtrain, + num_round, + evals=watchlist, + callbacks=[scheduler(eta_decay)], + evals_result=evals_result, + ) eval_errors_3 = list(map(float, evals_result['eval']['error'])) assert isinstance(bst, xgb.core.Booster) @@ -351,8 +360,7 @@ class TestCallbacks: for i in range(1, len(eval_errors_0)): assert eval_errors_3[i] != eval_errors_2[i] - with warning_check: - xgb.cv(param, dtrain, num_round, callbacks=[scheduler(eta_decay)]) + xgb.cv(param, dtrain, num_round, callbacks=[scheduler(eta_decay)]) def run_eta_decay_leaf_output(self, tree_method: str, objective: str) -> None: # check decay has effect on leaf output. @@ -378,7 +386,7 @@ class TestCallbacks: param, dtrain, num_round, - watchlist, + evals=watchlist, callbacks=[scheduler(eta_decay_0)], ) @@ -391,7 +399,7 @@ class TestCallbacks: param, dtrain, num_round, - watchlist, + evals=watchlist, callbacks=[scheduler(eta_decay_1)], ) bst_json0 = bst0.save_raw(raw_format="json") @@ -474,3 +482,24 @@ class TestCallbacks: callbacks=callbacks, ) assert len(callbacks) == 1 + + def test_attribute_error(self) -> None: + from sklearn.datasets import load_breast_cancer + + X, y = load_breast_cancer(return_X_y=True) + + clf = xgb.XGBClassifier(n_estimators=8) + clf.fit(X, y, eval_set=[(X, y)]) + + with pytest.raises(AttributeError, match="early stopping is used"): + clf.best_iteration + + with pytest.raises(AttributeError, match="early stopping is used"): + clf.best_score + + booster = clf.get_booster() + with pytest.raises(AttributeError, match="early stopping is used"): + booster.best_iteration + + with pytest.raises(AttributeError, match="early stopping is used"): + booster.best_score diff --git a/tests/python/test_predict.py b/tests/python/test_predict.py index 04a7d70cb..6ed9c39f7 100644 --- a/tests/python/test_predict.py +++ b/tests/python/test_predict.py @@ -173,7 +173,7 @@ class TestInplacePredict: np.testing.assert_allclose(predt_from_dmatrix, predt_from_array) with pytest.raises(ValueError): - booster.predict(test, iteration_range=(0, booster.best_iteration + 2)) + booster.predict(test, iteration_range=(0, booster.num_boosted_rounds() + 2)) default = booster.predict(test) @@ -181,7 +181,7 @@ class TestInplacePredict: np.testing.assert_allclose(range_full, default) range_full = booster.predict( - test, iteration_range=(0, booster.best_iteration + 1) + test, iteration_range=(0, booster.num_boosted_rounds()) ) np.testing.assert_allclose(range_full, default) diff --git 
a/tests/python/test_training_continuation.py b/tests/python/test_training_continuation.py index 3cbe6a421..6b2f96301 100644 --- a/tests/python/test_training_continuation.py +++ b/tests/python/test_training_continuation.py @@ -100,8 +100,8 @@ class TestTrainingContinuation: res2 = mean_squared_error( y_2class, gbdt_04.predict( - dtrain_2class, iteration_range=(0, gbdt_04.best_iteration + 1) - ) + dtrain_2class, iteration_range=(0, gbdt_04.num_boosted_rounds()) + ), ) assert res1 == res2 @@ -112,7 +112,7 @@ class TestTrainingContinuation: res2 = mean_squared_error( y_2class, gbdt_04.predict( - dtrain_2class, iteration_range=(0, gbdt_04.best_iteration + 1) + dtrain_2class, iteration_range=(0, gbdt_04.num_boosted_rounds()) ) ) assert res1 == res2 @@ -126,7 +126,7 @@ class TestTrainingContinuation: res1 = gbdt_05.predict(dtrain_5class) res2 = gbdt_05.predict( - dtrain_5class, iteration_range=(0, gbdt_05.best_iteration + 1) + dtrain_5class, iteration_range=(0, gbdt_05.num_boosted_rounds()) ) np.testing.assert_almost_equal(res1, res2) @@ -138,15 +138,16 @@ class TestTrainingContinuation: @pytest.mark.skipif(**tm.no_sklearn()) def test_training_continuation_updaters_json(self): # Picked up from R tests. - updaters = 'grow_colmaker,prune,refresh' + updaters = "grow_colmaker,prune,refresh" params = self.generate_parameters() for p in params: - p['updater'] = updaters + p["updater"] = updaters self.run_training_continuation(params[0], params[1], params[2]) @pytest.mark.skipif(**tm.no_sklearn()) def test_changed_parameter(self): from sklearn.datasets import load_breast_cancer + X, y = load_breast_cancer(return_X_y=True) clf = xgb.XGBClassifier(n_estimators=2) clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss") From a196443a0734df3cf646c951858a294a0325c697 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 24 Jul 2023 15:43:03 +0800 Subject: [PATCH 050/136] Implement sketching with Hessian on GPU. (#9399) - Prepare for implementing approx on GPU. - Unify the code path between weighted and uniform sketching on DMatrix. --- include/xgboost/data.h | 12 +- include/xgboost/host_device_vector.h | 10 +- src/common/hist_util.cc | 6 +- src/common/hist_util.cu | 281 +++++++++++++------------- src/common/hist_util.cuh | 44 +++- src/common/hist_util.h | 2 +- src/common/host_device_vector.cc | 3 + src/common/host_device_vector.cu | 5 + src/data/ellpack_page.cu | 2 +- src/data/gradient_index.cc | 2 +- src/data/gradient_index.h | 2 +- src/data/sparse_page_dmatrix.cu | 4 +- tests/cpp/common/test_hist_util.cu | 288 +++++++++++++++++++++------ tests/cpp/common/test_quantile.cu | 15 +- 14 files changed, 446 insertions(+), 230 deletions(-) diff --git a/include/xgboost/data.h b/include/xgboost/data.h index 472ca43b3..eae2f612b 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -185,10 +185,10 @@ class MetaInfo { return data_split_mode == DataSplitMode::kRow; } - /*! \brief Whether the data is split column-wise. */ - bool IsColumnSplit() const { - return data_split_mode == DataSplitMode::kCol; - } + /** @brief Whether the data is split column-wise. */ + bool IsColumnSplit() const { return data_split_mode == DataSplitMode::kCol; } + /** @brief Whether this is a learning to rank data. */ + bool IsRanking() const { return !group_ptr_.empty(); } /*! * \brief A convenient method to check if we are doing vertical federated learning, which requires @@ -249,7 +249,7 @@ struct BatchParam { /** * \brief Hessian, used for sketching with future approx implementation. 
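 * When non-empty, the quantile sketch weights each sample by its Hessian
 * value rather than treating all rows uniformly.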
*/ - common::Span hess; + common::Span hess; /** * \brief Whether should we force DMatrix to regenerate the batch. Only used for * GHistIndex. @@ -279,7 +279,7 @@ struct BatchParam { * Get batch with sketch weighted by hessian. The batch will be regenerated if the * span is changed, so caller should keep the span for each iteration. */ - BatchParam(bst_bin_t max_bin, common::Span hessian, bool regenerate) + BatchParam(bst_bin_t max_bin, common::Span hessian, bool regenerate) : max_bin{max_bin}, hess{hessian}, regen{regenerate} {} [[nodiscard]] bool ParamNotEqual(BatchParam const& other) const { diff --git a/include/xgboost/host_device_vector.h b/include/xgboost/host_device_vector.h index b9fb15104..b221d7206 100644 --- a/include/xgboost/host_device_vector.h +++ b/include/xgboost/host_device_vector.h @@ -49,11 +49,12 @@ #ifndef XGBOOST_HOST_DEVICE_VECTOR_H_ #define XGBOOST_HOST_DEVICE_VECTOR_H_ -#include -#include -#include +#include // for DeviceOrd +#include // for Span -#include "span.h" +#include +#include +#include namespace xgboost { @@ -133,6 +134,7 @@ class HostDeviceVector { GPUAccess DeviceAccess() const; void SetDevice(int device) const; + void SetDevice(DeviceOrd device) const; void Resize(size_t new_size, T v = T()); diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index 1d950e70a..489ef2396 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -12,8 +12,8 @@ #include "../data/gradient_index.h" // for GHistIndexMatrix #include "quantile.h" #include "xgboost/base.h" -#include "xgboost/context.h" // Context -#include "xgboost/data.h" // SparsePage, SortedCSCPage +#include "xgboost/context.h" // for Context +#include "xgboost/data.h" // for SparsePage, SortedCSCPage #if defined(XGBOOST_MM_PREFETCH_PRESENT) #include @@ -30,7 +30,7 @@ HistogramCuts::HistogramCuts() { } HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins, bool use_sorted, - Span const hessian) { + Span hessian) { HistogramCuts out; auto const &info = m->Info(); auto n_threads = ctx->Threads(); diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu index eabdb86de..2dfba7215 100644 --- a/src/common/hist_util.cu +++ b/src/common/hist_util.cu @@ -19,14 +19,13 @@ #include #include "categorical.h" +#include "cuda_context.cuh" // for CUDAContext #include "device_helpers.cuh" #include "hist_util.cuh" #include "hist_util.h" -#include "math.h" // NOLINT #include "quantile.h" #include "xgboost/host_device_vector.h" - namespace xgboost::common { constexpr float SketchContainer::kFactor; @@ -109,22 +108,19 @@ size_t SketchBatchNumElements(size_t sketch_batch_num_elements, bst_row_t num_ro return std::min(sketch_batch_num_elements, kIntMax); } -void SortByWeight(dh::device_vector* weights, - dh::device_vector* sorted_entries) { +void SortByWeight(dh::device_vector* weights, dh::device_vector* sorted_entries) { // Sort both entries and wegihts. 
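  // Entries are the sort keys, ordered by column index and then feature
  // value; the weights travel with their entries through the sort. The scan
  // below then accumulates the weights within each column.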
dh::XGBDeviceAllocator alloc; - thrust::sort_by_key(thrust::cuda::par(alloc), sorted_entries->begin(), - sorted_entries->end(), weights->begin(), - detail::EntryCompareOp()); + CHECK_EQ(weights->size(), sorted_entries->size()); + thrust::sort_by_key(thrust::cuda::par(alloc), sorted_entries->begin(), sorted_entries->end(), + weights->begin(), detail::EntryCompareOp()); // Scan weights dh::XGBCachingDeviceAllocator caching; - thrust::inclusive_scan_by_key(thrust::cuda::par(caching), - sorted_entries->begin(), sorted_entries->end(), - weights->begin(), weights->begin(), - [=] __device__(const Entry& a, const Entry& b) { - return a.index == b.index; - }); + thrust::inclusive_scan_by_key( + thrust::cuda::par(caching), sorted_entries->begin(), sorted_entries->end(), weights->begin(), + weights->begin(), + [=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; }); } void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span d_cuts_ptr, @@ -200,159 +196,170 @@ void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span alloc; +void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo const& info, + std::size_t begin, std::size_t end, + SketchContainer* sketch_container, // <- output sketch + int num_cuts_per_feature, common::Span sample_weight) { dh::device_vector sorted_entries; if (page.data.DeviceCanRead()) { - const auto& device_data = page.data.ConstDevicePointer(); - sorted_entries = dh::device_vector(device_data + begin, device_data + end); + // direct copy if data is already on device + auto const& d_data = page.data.ConstDevicePointer(); + sorted_entries = dh::device_vector(d_data + begin, d_data + end); } else { - const auto& host_data = page.data.ConstHostVector(); - sorted_entries = dh::device_vector(host_data.begin() + begin, - host_data.begin() + end); + const auto& h_data = page.data.ConstHostVector(); + sorted_entries = dh::device_vector(h_data.begin() + begin, h_data.begin() + end); + } + + bst_row_t base_rowid = page.base_rowid; + + dh::device_vector entry_weight; + auto cuctx = ctx->CUDACtx(); + if (!sample_weight.empty()) { + // Expand sample weight into entry weight. + CHECK_EQ(sample_weight.size(), info.num_row_); + entry_weight.resize(sorted_entries.size()); + auto d_temp_weight = dh::ToSpan(entry_weight); + page.offset.SetDevice(ctx->Device()); + auto row_ptrs = page.offset.ConstDeviceSpan(); + thrust::for_each_n(cuctx->CTP(), thrust::make_counting_iterator(0ul), entry_weight.size(), + [=] __device__(std::size_t idx) { + std::size_t element_idx = idx + begin; + std::size_t ridx = dh::SegmentId(row_ptrs, element_idx); + d_temp_weight[idx] = sample_weight[ridx + base_rowid]; + }); + detail::SortByWeight(&entry_weight, &sorted_entries); + } else { + thrust::sort(cuctx->CTP(), sorted_entries.begin(), sorted_entries.end(), + detail::EntryCompareOp()); } - thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(), - sorted_entries.end(), detail::EntryCompareOp()); HostDeviceVector cuts_ptr; dh::caching_device_vector column_sizes_scan; data::IsValidFunctor dummy_is_valid(std::numeric_limits::quiet_NaN()); auto batch_it = dh::MakeTransformIterator( - sorted_entries.data().get(), - [] __device__(Entry const &e) -> data::COOTuple { - return {0, e.index, e.fvalue}; // row_idx is not needed for scanning column size. + sorted_entries.data().get(), [] __device__(Entry const& e) -> data::COOTuple { + return {0, e.index, e.fvalue}; // row_idx is not needed for scaning column size. 
}); - detail::GetColumnSizesScan(device, num_columns, num_cuts_per_feature, + detail::GetColumnSizesScan(ctx->Ordinal(), info.num_col_, num_cuts_per_feature, IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr, &column_sizes_scan); auto d_cuts_ptr = cuts_ptr.DeviceSpan(); - if (sketch_container->HasCategorical()) { - detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, nullptr, + auto p_weight = entry_weight.empty() ? nullptr : &entry_weight; + detail::RemoveDuplicatedCategories(ctx->Ordinal(), info, d_cuts_ptr, &sorted_entries, p_weight, &column_sizes_scan); } auto const& h_cuts_ptr = cuts_ptr.ConstHostVector(); CHECK_EQ(d_cuts_ptr.size(), column_sizes_scan.size()); - // add cuts into sketches - sketch_container->Push(dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan), - d_cuts_ptr, h_cuts_ptr.back()); + // Add cuts into sketches + sketch_container->Push(dh::ToSpan(sorted_entries), dh::ToSpan(column_sizes_scan), d_cuts_ptr, + h_cuts_ptr.back(), dh::ToSpan(entry_weight)); + sorted_entries.clear(); sorted_entries.shrink_to_fit(); CHECK_EQ(sorted_entries.capacity(), 0); CHECK_NE(cuts_ptr.Size(), 0); } -void ProcessWeightedBatch(int device, const SparsePage& page, - MetaInfo const& info, size_t begin, size_t end, - SketchContainer* sketch_container, int num_cuts_per_feature, - size_t num_columns, - bool is_ranking, Span d_group_ptr) { - auto weights = info.weights_.ConstDeviceSpan(); +// Unify group weight, Hessian, and sample weight into sample weight. +[[nodiscard]] Span UnifyWeight(CUDAContext const* cuctx, MetaInfo const& info, + common::Span hessian, + HostDeviceVector* p_out_weight) { + if (hessian.empty()) { + if (info.IsRanking() && !info.weights_.Empty()) { + common::Span group_weight = info.weights_.ConstDeviceSpan(); + dh::device_vector group_ptr(info.group_ptr_); + auto d_group_ptr = dh::ToSpan(group_ptr); + CHECK_GE(d_group_ptr.size(), 2) << "Must have at least 1 group for ranking."; + auto d_weight = info.weights_.ConstDeviceSpan(); + CHECK_EQ(d_weight.size(), d_group_ptr.size() - 1) + << "Weight size should equal to number of groups."; + p_out_weight->Resize(info.num_row_); + auto d_weight_out = p_out_weight->DeviceSpan(); - dh::XGBCachingDeviceAllocator alloc; - const auto& host_data = page.data.ConstHostVector(); - dh::device_vector sorted_entries(host_data.begin() + begin, - host_data.begin() + end); - - // Binary search to assign weights to each element - dh::device_vector temp_weights(sorted_entries.size()); - auto d_temp_weights = temp_weights.data().get(); - page.offset.SetDevice(device); - auto row_ptrs = page.offset.ConstDeviceSpan(); - size_t base_rowid = page.base_rowid; - if (is_ranking) { - CHECK_GE(d_group_ptr.size(), 2) - << "Must have at least 1 group for ranking."; - CHECK_EQ(weights.size(), d_group_ptr.size() - 1) - << "Weight size should equal to number of groups."; - dh::LaunchN(temp_weights.size(), [=] __device__(size_t idx) { - size_t element_idx = idx + begin; - size_t ridx = dh::SegmentId(row_ptrs, element_idx); - bst_group_t group_idx = dh::SegmentId(d_group_ptr, ridx + base_rowid); - d_temp_weights[idx] = weights[group_idx]; - }); - } else { - dh::LaunchN(temp_weights.size(), [=] __device__(size_t idx) { - size_t element_idx = idx + begin; - size_t ridx = dh::SegmentId(row_ptrs, element_idx); - d_temp_weights[idx] = weights[ridx + base_rowid]; - }); - } - detail::SortByWeight(&temp_weights, &sorted_entries); - - HostDeviceVector cuts_ptr; - dh::caching_device_vector column_sizes_scan; - 
data::IsValidFunctor dummy_is_valid(std::numeric_limits::quiet_NaN()); - auto batch_it = dh::MakeTransformIterator( - sorted_entries.data().get(), - [] __device__(Entry const &e) -> data::COOTuple { - return {0, e.index, e.fvalue}; // row_idx is not needed for scaning column size. - }); - detail::GetColumnSizesScan(device, num_columns, num_cuts_per_feature, - IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr, - &column_sizes_scan); - auto d_cuts_ptr = cuts_ptr.DeviceSpan(); - if (sketch_container->HasCategorical()) { - detail::RemoveDuplicatedCategories(device, info, d_cuts_ptr, &sorted_entries, &temp_weights, - &column_sizes_scan); - } - - auto const& h_cuts_ptr = cuts_ptr.ConstHostVector(); - - // Extract cuts - sketch_container->Push(dh::ToSpan(sorted_entries), - dh::ToSpan(column_sizes_scan), d_cuts_ptr, - h_cuts_ptr.back(), dh::ToSpan(temp_weights)); - sorted_entries.clear(); - sorted_entries.shrink_to_fit(); -} - -HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins, - size_t sketch_batch_num_elements) { - dmat->Info().feature_types.SetDevice(device); - dmat->Info().feature_types.ConstDevicePointer(); // pull to device early - // Configure batch size based on available memory - bool has_weights = dmat->Info().weights_.Size() > 0; - size_t num_cuts_per_feature = - detail::RequiredSampleCutsPerColumn(max_bins, dmat->Info().num_row_); - sketch_batch_num_elements = detail::SketchBatchNumElements( - sketch_batch_num_elements, - dmat->Info().num_row_, - dmat->Info().num_col_, - dmat->Info().num_nonzero_, - device, num_cuts_per_feature, has_weights); - - HistogramCuts cuts; - SketchContainer sketch_container(dmat->Info().feature_types, max_bins, dmat->Info().num_col_, - dmat->Info().num_row_, device); - - dmat->Info().weights_.SetDevice(device); - for (const auto& batch : dmat->GetBatches()) { - size_t batch_nnz = batch.data.Size(); - auto const& info = dmat->Info(); - for (auto begin = 0ull; begin < batch_nnz; begin += sketch_batch_num_elements) { - size_t end = std::min(batch_nnz, static_cast(begin + sketch_batch_num_elements)); - if (has_weights) { - bool is_ranking = HostSketchContainer::UseGroup(dmat->Info()); - dh::caching_device_vector groups(info.group_ptr_.cbegin(), - info.group_ptr_.cend()); - ProcessWeightedBatch( - device, batch, dmat->Info(), begin, end, - &sketch_container, - num_cuts_per_feature, - dmat->Info().num_col_, - is_ranking, dh::ToSpan(groups)); - } else { - ProcessBatch(device, dmat->Info(), batch, begin, end, &sketch_container, - num_cuts_per_feature, dmat->Info().num_col_); - } + thrust::for_each_n(cuctx->CTP(), thrust::make_counting_iterator(0ul), d_weight_out.size(), + [=] XGBOOST_DEVICE(std::size_t i) { + auto gidx = dh::SegmentId(d_group_ptr, i); + d_weight_out[i] = d_weight[gidx]; + }); + return p_out_weight->ConstDeviceSpan(); + } else { + return info.weights_.ConstDeviceSpan(); } } - sketch_container.MakeCuts(&cuts, dmat->Info().IsColumnSplit()); + + // sketch with hessian as weight + p_out_weight->Resize(info.num_row_); + auto d_weight_out = p_out_weight->DeviceSpan(); + if (!info.weights_.Empty()) { + // merge sample weight with hessian + auto d_weight = info.weights_.ConstDeviceSpan(); + if (info.IsRanking()) { + dh::device_vector group_ptr(info.group_ptr_); + CHECK_EQ(hessian.size(), d_weight_out.size()); + auto d_group_ptr = dh::ToSpan(group_ptr); + CHECK_GE(d_group_ptr.size(), 2) << "Must have at least 1 group for ranking."; + CHECK_EQ(d_weight.size(), d_group_ptr.size() - 1) + << "Weight size should equal to number 
of groups."; + thrust::for_each_n(cuctx->CTP(), thrust::make_counting_iterator(0ul), hessian.size(), + [=] XGBOOST_DEVICE(std::size_t i) { + d_weight_out[i] = d_weight[dh::SegmentId(d_group_ptr, i)] * hessian(i); + }); + } else { + CHECK_EQ(hessian.size(), info.num_row_); + CHECK_EQ(hessian.size(), d_weight.size()); + CHECK_EQ(hessian.size(), d_weight_out.size()); + thrust::for_each_n( + cuctx->CTP(), thrust::make_counting_iterator(0ul), hessian.size(), + [=] XGBOOST_DEVICE(std::size_t i) { d_weight_out[i] = d_weight[i] * hessian(i); }); + } + } else { + // copy hessian as weight + CHECK_EQ(d_weight_out.size(), hessian.size()); + dh::safe_cuda(cudaMemcpyAsync(d_weight_out.data(), hessian.data(), hessian.size_bytes(), + cudaMemcpyDefault)); + } + return d_weight_out; +} + +HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_bin_t max_bin, + Span hessian, + std::size_t sketch_batch_num_elements) { + auto const& info = p_fmat->Info(); + bool has_weight = !info.weights_.Empty(); + info.feature_types.SetDevice(ctx->Device()); + + HostDeviceVector weight; + weight.SetDevice(ctx->Device()); + + // Configure batch size based on available memory + std::size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(max_bin, info.num_row_); + sketch_batch_num_elements = detail::SketchBatchNumElements( + sketch_batch_num_elements, info.num_row_, info.num_col_, info.num_nonzero_, ctx->Ordinal(), + num_cuts_per_feature, has_weight); + + CUDAContext const* cuctx = ctx->CUDACtx(); + + info.weights_.SetDevice(ctx->Device()); + auto d_weight = UnifyWeight(cuctx, info, hessian, &weight); + + HistogramCuts cuts; + SketchContainer sketch_container(info.feature_types, max_bin, info.num_col_, info.num_row_, + ctx->Ordinal()); + CHECK_EQ(has_weight || !hessian.empty(), !d_weight.empty()); + for (const auto& page : p_fmat->GetBatches()) { + std::size_t page_nnz = page.data.Size(); + for (auto begin = 0ull; begin < page_nnz; begin += sketch_batch_num_elements) { + std::size_t end = + std::min(page_nnz, static_cast(begin + sketch_batch_num_elements)); + ProcessWeightedBatch(ctx, page, info, begin, end, &sketch_container, num_cuts_per_feature, + d_weight); + } + } + + sketch_container.MakeCuts(&cuts, p_fmat->Info().IsColumnSplit()); return cuts; } } // namespace xgboost::common diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh index 5e5ce80ca..d7be12749 100644 --- a/src/common/hist_util.cuh +++ b/src/common/hist_util.cuh @@ -11,14 +11,13 @@ #include // for size_t -#include "../data/device_adapter.cuh" +#include "../data/adapter.h" // for IsValidFunctor #include "device_helpers.cuh" #include "hist_util.h" #include "quantile.cuh" -#include "timer.h" +#include "xgboost/span.h" // for IterSpan -namespace xgboost { -namespace common { +namespace xgboost::common { namespace cuda { /** * copy and paste of the host version, we can't make it a __host__ __device__ function as @@ -246,10 +245,35 @@ void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span* p_column_sizes_scan); } // namespace detail -// Compute sketch on DMatrix. -// sketch_batch_num_elements 0 means autodetect. Only modify this for testing. -HistogramCuts DeviceSketch(int device, DMatrix* dmat, int max_bins, - size_t sketch_batch_num_elements = 0); +/** + * @brief Compute sketch on DMatrix with GPU and Hessian as weight. + * + * @param ctx Runtime context + * @param p_fmat Training feature matrix + * @param max_bin Maximum number of bins for each feature + * @param hessian Hessian vector. 
+ * @param sketch_batch_num_elements 0 means autodetect. Only modify this for testing. + * + * @return Quantile cuts + */ +HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_bin_t max_bin, + Span hessian, + std::size_t sketch_batch_num_elements = 0); + +/** + * @brief Compute sketch on DMatrix with GPU. + * + * @param ctx Runtime context + * @param p_fmat Training feature matrix + * @param max_bin Maximum number of bins for each feature + * @param sketch_batch_num_elements 0 means autodetect. Only modify this for testing. + * + * @return Quantile cuts + */ +inline HistogramCuts DeviceSketch(Context const* ctx, DMatrix* p_fmat, bst_bin_t max_bin, + std::size_t sketch_batch_num_elements = 0) { + return DeviceSketchWithHessian(ctx, p_fmat, max_bin, {}, sketch_batch_num_elements); +} template void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info, @@ -417,7 +441,5 @@ void AdapterDeviceSketch(Batch batch, int num_bins, } } } -} // namespace common -} // namespace xgboost - +} // namespace xgboost::common #endif // COMMON_HIST_UTIL_CUH_ diff --git a/src/common/hist_util.h b/src/common/hist_util.h index c0fe5b44f..fd364b8ac 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -172,7 +172,7 @@ class HistogramCuts { * but consumes more memory. */ HistogramCuts SketchOnDMatrix(Context const* ctx, DMatrix* m, bst_bin_t max_bins, - bool use_sorted = false, Span const hessian = {}); + bool use_sorted = false, Span hessian = {}); enum BinTypeSize : uint8_t { kUint8BinsTypeSize = 1, diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc index 55c0ecf20..175a5cbf1 100644 --- a/src/common/host_device_vector.cc +++ b/src/common/host_device_vector.cc @@ -168,6 +168,9 @@ bool HostDeviceVector::DeviceCanWrite() const { template void HostDeviceVector::SetDevice(int) const {} +template +void HostDeviceVector::SetDevice(DeviceOrd) const {} + // explicit instantiations are required, as HostDeviceVector isn't header-only template class HostDeviceVector; template class HostDeviceVector; diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu index 1fa9a3b22..7acb6719b 100644 --- a/src/common/host_device_vector.cu +++ b/src/common/host_device_vector.cu @@ -394,6 +394,11 @@ void HostDeviceVector::SetDevice(int device) const { impl_->SetDevice(device); } +template +void HostDeviceVector::SetDevice(DeviceOrd device) const { + impl_->SetDevice(device.ordinal); +} + template void HostDeviceVector::Resize(size_t new_size, T v) { impl_->Resize(new_size, v); diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index 0ccd7a081..7097df405 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -131,7 +131,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchP monitor_.Start("Quantiles"); // Create the quantile sketches for the dmatrix and initialize HistogramCuts. 
row_stride = GetRowStride(dmat); - cuts_ = common::DeviceSketch(ctx->gpu_id, dmat, param.max_bin); + cuts_ = common::DeviceSketch(ctx, dmat, param.max_bin); monitor_.Stop("Quantiles"); monitor_.Start("InitCompressedData"); diff --git a/src/data/gradient_index.cc b/src/data/gradient_index.cc index 1d47ae9e6..1ee1bd60b 100644 --- a/src/data/gradient_index.cc +++ b/src/data/gradient_index.cc @@ -21,7 +21,7 @@ GHistIndexMatrix::GHistIndexMatrix() : columns_{std::make_unique hess) + common::Span hess) : max_numeric_bins_per_feat{max_bins_per_feat} { CHECK(p_fmat->SingleColBlock()); // We use sorted sketching for approx tree method since it's more efficient in diff --git a/src/data/gradient_index.h b/src/data/gradient_index.h index 901451ad9..0bb93fc20 100644 --- a/src/data/gradient_index.h +++ b/src/data/gradient_index.h @@ -160,7 +160,7 @@ class GHistIndexMatrix { * \brief Constrcutor for SimpleDMatrix. */ GHistIndexMatrix(Context const* ctx, DMatrix* x, bst_bin_t max_bins_per_feat, - double sparse_thresh, bool sorted_sketch, common::Span hess = {}); + double sparse_thresh, bool sorted_sketch, common::Span hess = {}); /** * \brief Constructor for Iterative DMatrix. Initialize basic information and prepare * for push batch. diff --git a/src/data/sparse_page_dmatrix.cu b/src/data/sparse_page_dmatrix.cu index 38304f725..1d9af9f06 100644 --- a/src/data/sparse_page_dmatrix.cu +++ b/src/data/sparse_page_dmatrix.cu @@ -25,8 +25,8 @@ BatchSet SparsePageDMatrix::GetEllpackBatches(Context const* ctx, cache_info_.erase(id); MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_); std::unique_ptr cuts; - cuts = std::make_unique( - common::DeviceSketch(ctx->gpu_id, this, param.max_bin, 0)); + cuts = + std::make_unique(common::DeviceSketch(ctx, this, param.max_bin, 0)); this->InitializeSparsePage(ctx); // reset after use. 
row_stride = GetRowStride(this); diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index 2d5735925..91baad981 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -3,17 +3,22 @@ */ #include #include +#include // for bst_bin_t #include #include -#include -#include +#include // for transform +#include // for floor +#include // for size_t +#include // for numeric_limits +#include // for string, to_string +#include // for tuple, make_tuple +#include // for vector #include "../../../include/xgboost/logging.h" #include "../../../src/common/device_helpers.cuh" #include "../../../src/common/hist_util.cuh" #include "../../../src/common/hist_util.h" -#include "../../../src/common/math.h" #include "../../../src/data/device_adapter.cuh" #include "../../../src/data/simple_dmatrix.h" #include "../data/test_array_interface.h" @@ -21,8 +26,7 @@ #include "../helpers.h" #include "test_hist_util.h" -namespace xgboost { -namespace common { +namespace xgboost::common { template HistogramCuts GetHostCuts(Context const* ctx, AdapterT* adapter, int num_bins, float missing) { @@ -32,16 +36,17 @@ HistogramCuts GetHostCuts(Context const* ctx, AdapterT* adapter, int num_bins, f } TEST(HistUtil, DeviceSketch) { + auto ctx = MakeCUDACtx(0); int num_columns = 1; int num_bins = 4; std::vector x = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 7.0f, -1.0f}; int num_rows = x.size(); auto dmat = GetDMatrixFromData(x, num_rows, num_columns); - auto device_cuts = DeviceSketch(0, dmat.get(), num_bins); + auto device_cuts = DeviceSketch(&ctx, dmat.get(), num_bins); - Context ctx; - HistogramCuts host_cuts = SketchOnDMatrix(&ctx, dmat.get(), num_bins); + Context cpu_ctx; + HistogramCuts host_cuts = SketchOnDMatrix(&cpu_ctx, dmat.get(), num_bins); EXPECT_EQ(device_cuts.Values(), host_cuts.Values()); EXPECT_EQ(device_cuts.Ptrs(), host_cuts.Ptrs()); @@ -65,6 +70,7 @@ TEST(HistUtil, SketchBatchNumElements) { } TEST(HistUtil, DeviceSketchMemory) { + auto ctx = MakeCUDACtx(0); int num_columns = 100; int num_rows = 1000; int num_bins = 256; @@ -73,7 +79,7 @@ TEST(HistUtil, DeviceSketchMemory) { dh::GlobalMemoryLogger().Clear(); ConsoleLogger::Configure({{"verbosity", "3"}}); - auto device_cuts = DeviceSketch(0, dmat.get(), num_bins); + auto device_cuts = DeviceSketch(&ctx, dmat.get(), num_bins); size_t bytes_required = detail::RequiredMemory( num_rows, num_columns, num_rows * num_columns, num_bins, false); @@ -83,6 +89,7 @@ TEST(HistUtil, DeviceSketchMemory) { } TEST(HistUtil, DeviceSketchWeightsMemory) { + auto ctx = MakeCUDACtx(0); int num_columns = 100; int num_rows = 1000; int num_bins = 256; @@ -92,7 +99,7 @@ TEST(HistUtil, DeviceSketchWeightsMemory) { dh::GlobalMemoryLogger().Clear(); ConsoleLogger::Configure({{"verbosity", "3"}}); - auto device_cuts = DeviceSketch(0, dmat.get(), num_bins); + auto device_cuts = DeviceSketch(&ctx, dmat.get(), num_bins); ConsoleLogger::Configure({{"verbosity", "0"}}); size_t bytes_required = detail::RequiredMemory( @@ -102,43 +109,44 @@ TEST(HistUtil, DeviceSketchWeightsMemory) { } TEST(HistUtil, DeviceSketchDeterminism) { + auto ctx = MakeCUDACtx(0); int num_rows = 500; int num_columns = 5; int num_bins = 256; auto x = GenerateRandom(num_rows, num_columns); auto dmat = GetDMatrixFromData(x, num_rows, num_columns); - auto reference_sketch = DeviceSketch(0, dmat.get(), num_bins); + auto reference_sketch = DeviceSketch(&ctx, dmat.get(), num_bins); size_t constexpr kRounds{ 100 }; for (size_t r = 0; r < kRounds; ++r) { - auto new_sketch = 
DeviceSketch(0, dmat.get(), num_bins); + auto new_sketch = DeviceSketch(&ctx, dmat.get(), num_bins); ASSERT_EQ(reference_sketch.Values(), new_sketch.Values()); ASSERT_EQ(reference_sketch.MinValues(), new_sketch.MinValues()); } } TEST(HistUtil, DeviceSketchCategoricalAsNumeric) { - int categorical_sizes[] = {2, 6, 8, 12}; + auto ctx = MakeCUDACtx(0); + auto categorical_sizes = {2, 6, 8, 12}; int num_bins = 256; - int sizes[] = {25, 100, 1000}; + auto sizes = {25, 100, 1000}; for (auto n : sizes) { for (auto num_categories : categorical_sizes) { auto x = GenerateRandomCategoricalSingleColumn(n, num_categories); auto dmat = GetDMatrixFromData(x, n, 1); - auto cuts = DeviceSketch(0, dmat.get(), num_bins); + auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins); ValidateCuts(cuts, dmat.get(), num_bins); } } } TEST(HistUtil, DeviceSketchCategoricalFeatures) { - TestCategoricalSketch(1000, 256, 32, false, - [](DMatrix *p_fmat, int32_t num_bins) { - return DeviceSketch(0, p_fmat, num_bins); - }); - TestCategoricalSketch(1000, 256, 32, true, - [](DMatrix *p_fmat, int32_t num_bins) { - return DeviceSketch(0, p_fmat, num_bins); - }); + auto ctx = MakeCUDACtx(0); + TestCategoricalSketch(1000, 256, 32, false, [ctx](DMatrix* p_fmat, int32_t num_bins) { + return DeviceSketch(&ctx, p_fmat, num_bins); + }); + TestCategoricalSketch(1000, 256, 32, true, [ctx](DMatrix* p_fmat, int32_t num_bins) { + return DeviceSketch(&ctx, p_fmat, num_bins); + }); } void TestMixedSketch() { @@ -162,7 +170,8 @@ void TestMixedSketch() { m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical); m->Info().feature_types.HostVector().push_back(FeatureType::kNumerical); - auto cuts = DeviceSketch(0, m.get(), n_bins); + auto ctx = MakeCUDACtx(0); + auto cuts = DeviceSketch(&ctx, m.get(), n_bins); ASSERT_EQ(cuts.Values().size(), n_bins + n_categories); } @@ -234,37 +243,40 @@ TEST(HistUtil, RemoveDuplicatedCategories) { } TEST(HistUtil, DeviceSketchMultipleColumns) { - int bin_sizes[] = {2, 16, 256, 512}; - int sizes[] = {100, 1000, 1500}; + auto ctx = MakeCUDACtx(0); + auto bin_sizes = {2, 16, 256, 512}; + auto sizes = {100, 1000, 1500}; int num_columns = 5; for (auto num_rows : sizes) { auto x = GenerateRandom(num_rows, num_columns); auto dmat = GetDMatrixFromData(x, num_rows, num_columns); for (auto num_bins : bin_sizes) { - auto cuts = DeviceSketch(0, dmat.get(), num_bins); + auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins); ValidateCuts(cuts, dmat.get(), num_bins); } } } TEST(HistUtil, DeviceSketchMultipleColumnsWeights) { - int bin_sizes[] = {2, 16, 256, 512}; - int sizes[] = {100, 1000, 1500}; + auto ctx = MakeCUDACtx(0); + auto bin_sizes = {2, 16, 256, 512}; + auto sizes = {100, 1000, 1500}; int num_columns = 5; for (auto num_rows : sizes) { auto x = GenerateRandom(num_rows, num_columns); auto dmat = GetDMatrixFromData(x, num_rows, num_columns); dmat->Info().weights_.HostVector() = GenerateRandomWeights(num_rows); for (auto num_bins : bin_sizes) { - auto cuts = DeviceSketch(0, dmat.get(), num_bins); + auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins); ValidateCuts(cuts, dmat.get(), num_bins); } } } TEST(HistUitl, DeviceSketchWeights) { - int bin_sizes[] = {2, 16, 256, 512}; - int sizes[] = {100, 1000, 1500}; + auto ctx = MakeCUDACtx(0); + auto bin_sizes = {2, 16, 256, 512}; + auto sizes = {100, 1000, 1500}; int num_columns = 5; for (auto num_rows : sizes) { auto x = GenerateRandom(num_rows, num_columns); @@ -274,8 +286,8 @@ TEST(HistUitl, DeviceSketchWeights) { h_weights.resize(num_rows); 
std::fill(h_weights.begin(), h_weights.end(), 1.0f); for (auto num_bins : bin_sizes) { - auto cuts = DeviceSketch(0, dmat.get(), num_bins); - auto wcuts = DeviceSketch(0, weighted_dmat.get(), num_bins); + auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins); + auto wcuts = DeviceSketch(&ctx, weighted_dmat.get(), num_bins); ASSERT_EQ(cuts.MinValues(), wcuts.MinValues()); ASSERT_EQ(cuts.Ptrs(), wcuts.Ptrs()); ASSERT_EQ(cuts.Values(), wcuts.Values()); @@ -286,14 +298,15 @@ TEST(HistUitl, DeviceSketchWeights) { } TEST(HistUtil, DeviceSketchBatches) { + auto ctx = MakeCUDACtx(0); int num_bins = 256; int num_rows = 5000; - int batch_sizes[] = {0, 100, 1500, 6000}; + auto batch_sizes = {0, 100, 1500, 6000}; int num_columns = 5; for (auto batch_size : batch_sizes) { auto x = GenerateRandom(num_rows, num_columns); auto dmat = GetDMatrixFromData(x, num_rows, num_columns); - auto cuts = DeviceSketch(0, dmat.get(), num_bins, batch_size); + auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins, batch_size); ValidateCuts(cuts, dmat.get(), num_bins); } @@ -301,8 +314,8 @@ TEST(HistUtil, DeviceSketchBatches) { size_t batches = 16; auto x = GenerateRandom(num_rows * batches, num_columns); auto dmat = GetDMatrixFromData(x, num_rows * batches, num_columns); - auto cuts_with_batches = DeviceSketch(0, dmat.get(), num_bins, num_rows); - auto cuts = DeviceSketch(0, dmat.get(), num_bins, 0); + auto cuts_with_batches = DeviceSketch(&ctx, dmat.get(), num_bins, num_rows); + auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins, 0); auto const& cut_values_batched = cuts_with_batches.Values(); auto const& cut_values = cuts.Values(); @@ -313,15 +326,16 @@ TEST(HistUtil, DeviceSketchBatches) { } TEST(HistUtil, DeviceSketchMultipleColumnsExternal) { - int bin_sizes[] = {2, 16, 256, 512}; - int sizes[] = {100, 1000, 1500}; + auto ctx = MakeCUDACtx(0); + auto bin_sizes = {2, 16, 256, 512}; + auto sizes = {100, 1000, 1500}; int num_columns =5; for (auto num_rows : sizes) { auto x = GenerateRandom(num_rows, num_columns); dmlc::TemporaryDirectory temp; auto dmat = GetExternalMemoryDMatrixFromData(x, num_rows, num_columns, temp); for (auto num_bins : bin_sizes) { - auto cuts = DeviceSketch(0, dmat.get(), num_bins); + auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins); ValidateCuts(cuts, dmat.get(), num_bins); } } @@ -329,8 +343,9 @@ TEST(HistUtil, DeviceSketchMultipleColumnsExternal) { // See https://github.com/dmlc/xgboost/issues/5866. 
TEST(HistUtil, DeviceSketchExternalMemoryWithWeights) { - int bin_sizes[] = {2, 16, 256, 512}; - int sizes[] = {100, 1000, 1500}; + auto ctx = MakeCUDACtx(0); + auto bin_sizes = {2, 16, 256, 512}; + auto sizes = {100, 1000, 1500}; int num_columns = 5; dmlc::TemporaryDirectory temp; for (auto num_rows : sizes) { @@ -338,7 +353,7 @@ TEST(HistUtil, DeviceSketchExternalMemoryWithWeights) { auto dmat = GetExternalMemoryDMatrixFromData(x, num_rows, num_columns, temp); dmat->Info().weights_.HostVector() = GenerateRandomWeights(num_rows); for (auto num_bins : bin_sizes) { - auto cuts = DeviceSketch(0, dmat.get(), num_bins); + auto cuts = DeviceSketch(&ctx, dmat.get(), num_bins); ValidateCuts(cuts, dmat.get(), num_bins); } } @@ -504,9 +519,9 @@ void TestCategoricalSketchAdapter(size_t n, size_t num_categories, } TEST(HistUtil, AdapterDeviceSketchCategorical) { - int categorical_sizes[] = {2, 6, 8, 12}; + auto categorical_sizes = {2, 6, 8, 12}; int num_bins = 256; - int sizes[] = {25, 100, 1000}; + auto sizes = {25, 100, 1000}; for (auto n : sizes) { for (auto num_categories : categorical_sizes) { auto x = GenerateRandomCategoricalSingleColumn(n, num_categories); @@ -521,8 +536,8 @@ TEST(HistUtil, AdapterDeviceSketchCategorical) { } TEST(HistUtil, AdapterDeviceSketchMultipleColumns) { - int bin_sizes[] = {2, 16, 256, 512}; - int sizes[] = {100, 1000, 1500}; + auto bin_sizes = {2, 16, 256, 512}; + auto sizes = {100, 1000, 1500}; int num_columns = 5; for (auto num_rows : sizes) { auto x = GenerateRandom(num_rows, num_columns); @@ -538,7 +553,7 @@ TEST(HistUtil, AdapterDeviceSketchMultipleColumns) { TEST(HistUtil, AdapterDeviceSketchBatches) { int num_bins = 256; int num_rows = 5000; - int batch_sizes[] = {0, 100, 1500, 6000}; + auto batch_sizes = {0, 100, 1500, 6000}; int num_columns = 5; for (auto batch_size : batch_sizes) { auto x = GenerateRandom(num_rows, num_columns); @@ -619,14 +634,15 @@ TEST(HistUtil, GetColumnSize) { // Check sketching from adapter or DMatrix results in the same answer // Consistency here is useful for testing and user experience TEST(HistUtil, SketchingEquivalent) { - int bin_sizes[] = {2, 16, 256, 512}; - int sizes[] = {100, 1000, 1500}; + auto ctx = MakeCUDACtx(0); + auto bin_sizes = {2, 16, 256, 512}; + auto sizes = {100, 1000, 1500}; int num_columns = 5; for (auto num_rows : sizes) { auto x = GenerateRandom(num_rows, num_columns); auto dmat = GetDMatrixFromData(x, num_rows, num_columns); for (auto num_bins : bin_sizes) { - auto dmat_cuts = DeviceSketch(0, dmat.get(), num_bins); + auto dmat_cuts = DeviceSketch(&ctx, dmat.get(), num_bins); auto x_device = thrust::device_vector(x); auto adapter = AdapterFromData(x_device, num_rows, num_columns); common::HistogramCuts adapter_cuts = MakeUnweightedCutsForTest( @@ -641,21 +657,25 @@ TEST(HistUtil, SketchingEquivalent) { } TEST(HistUtil, DeviceSketchFromGroupWeights) { + auto ctx = MakeCUDACtx(0); size_t constexpr kRows = 3000, kCols = 200, kBins = 256; size_t constexpr kGroups = 10; auto m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); + + // sketch with group weight auto& h_weights = m->Info().weights_.HostVector(); - h_weights.resize(kRows); + h_weights.resize(kGroups); std::fill(h_weights.begin(), h_weights.end(), 1.0f); std::vector groups(kGroups); for (size_t i = 0; i < kGroups; ++i) { groups[i] = kRows / kGroups; } m->SetInfo("group", groups.data(), DataType::kUInt32, kGroups); - HistogramCuts weighted_cuts = DeviceSketch(0, m.get(), kBins, 0); + HistogramCuts weighted_cuts = DeviceSketch(&ctx, m.get(), kBins, 
0); + // sketch with no weight h_weights.clear(); - HistogramCuts cuts = DeviceSketch(0, m.get(), kBins, 0); + HistogramCuts cuts = DeviceSketch(&ctx, m.get(), kBins, 0); ASSERT_EQ(cuts.Values().size(), weighted_cuts.Values().size()); ASSERT_EQ(cuts.MinValues().size(), weighted_cuts.MinValues().size()); @@ -723,9 +743,10 @@ void TestAdapterSketchFromWeights(bool with_group) { ASSERT_EQ(cuts.Ptrs().size(), kCols + 1); ValidateCuts(cuts, dmat.get(), kBins); + auto cuda_ctx = MakeCUDACtx(0); if (with_group) { dmat->Info().weights_ = decltype(dmat->Info().weights_)(); // remove weight - HistogramCuts non_weighted = DeviceSketch(0, dmat.get(), kBins, 0); + HistogramCuts non_weighted = DeviceSketch(&cuda_ctx, dmat.get(), kBins, 0); for (size_t i = 0; i < cuts.Values().size(); ++i) { ASSERT_EQ(cuts.Values()[i], non_weighted.Values()[i]); } @@ -760,5 +781,156 @@ TEST(HistUtil, AdapterSketchFromWeights) { TestAdapterSketchFromWeights(false); TestAdapterSketchFromWeights(true); } -} // namespace common -} // namespace xgboost + +namespace { +class DeviceSketchWithHessianTest + : public ::testing::TestWithParam> { + bst_feature_t n_features_ = 5; + bst_group_t n_groups_{3}; + + auto GenerateHessian(Context const* ctx, bst_row_t n_samples) const { + HostDeviceVector hessian; + auto& h_hess = hessian.HostVector(); + h_hess = GenerateRandomWeights(n_samples); + std::mt19937 rng(0); + std::shuffle(h_hess.begin(), h_hess.end(), rng); + hessian.SetDevice(ctx->Device()); + return hessian; + } + + void CheckReg(Context const* ctx, std::shared_ptr p_fmat, bst_bin_t n_bins, + HostDeviceVector const& hessian, std::vector const& w, + std::size_t n_elements) const { + auto const& h_hess = hessian.ConstHostVector(); + { + auto& h_weight = p_fmat->Info().weights_.HostVector(); + h_weight = w; + } + + HistogramCuts cuts_hess = + DeviceSketchWithHessian(ctx, p_fmat.get(), n_bins, hessian.ConstDeviceSpan(), n_elements); + ValidateCuts(cuts_hess, p_fmat.get(), n_bins); + + // merge hessian + { + auto& h_weight = p_fmat->Info().weights_.HostVector(); + ASSERT_EQ(h_weight.size(), h_hess.size()); + for (std::size_t i = 0; i < h_weight.size(); ++i) { + h_weight[i] = w[i] * h_hess[i]; + } + } + + HistogramCuts cuts_wh = DeviceSketch(ctx, p_fmat.get(), n_bins, n_elements); + ValidateCuts(cuts_wh, p_fmat.get(), n_bins); + ASSERT_EQ(cuts_hess.Values().size(), cuts_wh.Values().size()); + for (std::size_t i = 0; i < cuts_hess.Values().size(); ++i) { + ASSERT_NEAR(cuts_wh.Values()[i], cuts_hess.Values()[i], kRtEps); + } + + p_fmat->Info().weights_.HostVector() = w; + } + + protected: + Context ctx_ = MakeCUDACtx(0); + + void TestLTR(Context const* ctx, bst_row_t n_samples, bst_bin_t n_bins, + std::size_t n_elements) const { + auto x = GenerateRandom(n_samples, n_features_); + + std::vector gptr; + gptr.resize(n_groups_ + 1, 0); + gptr[1] = n_samples / n_groups_; + gptr[2] = n_samples / n_groups_ + gptr[1]; + gptr.back() = n_samples; + + auto hessian = this->GenerateHessian(ctx, n_samples); + auto const& h_hess = hessian.ConstHostVector(); + auto p_fmat = GetDMatrixFromData(x, n_samples, n_features_); + p_fmat->Info().group_ptr_ = gptr; + + // test with constant group weight + std::vector w(n_groups_, 1.0f); + p_fmat->Info().weights_.HostVector() = w; + HistogramCuts cuts_hess = + DeviceSketchWithHessian(ctx, p_fmat.get(), n_bins, hessian.ConstDeviceSpan(), n_elements); + // make validation easier by converting it into sample weight. 
+ p_fmat->Info().weights_.HostVector() = h_hess; + p_fmat->Info().group_ptr_.clear(); + ValidateCuts(cuts_hess, p_fmat.get(), n_bins); + // restore ltr properties + p_fmat->Info().weights_.HostVector() = w; + p_fmat->Info().group_ptr_ = gptr; + + // test with random group weight + w = GenerateRandomWeights(n_groups_); + p_fmat->Info().weights_.HostVector() = w; + cuts_hess = + DeviceSketchWithHessian(ctx, p_fmat.get(), n_bins, hessian.ConstDeviceSpan(), n_elements); + // make validation easier by converting it into sample weight. + p_fmat->Info().weights_.HostVector() = h_hess; + p_fmat->Info().group_ptr_.clear(); + ValidateCuts(cuts_hess, p_fmat.get(), n_bins); + + // merge hessian with sample weight + p_fmat->Info().weights_.Resize(n_samples); + p_fmat->Info().group_ptr_.clear(); + for (std::size_t i = 0; i < h_hess.size(); ++i) { + auto gidx = dh::SegmentId(Span{gptr.data(), gptr.size()}, i); + p_fmat->Info().weights_.HostVector()[i] = w[gidx] * h_hess[i]; + } + auto cuts = DeviceSketch(ctx, p_fmat.get(), n_bins, n_elements); + ValidateCuts(cuts, p_fmat.get(), n_bins); + ASSERT_EQ(cuts.Values().size(), cuts_hess.Values().size()); + for (std::size_t i = 0; i < cuts.Values().size(); ++i) { + EXPECT_NEAR(cuts.Values()[i], cuts_hess.Values()[i], 1e-4f); + } + } + + void TestRegression(Context const* ctx, bst_row_t n_samples, bst_bin_t n_bins, + std::size_t n_elements) const { + auto x = GenerateRandom(n_samples, n_features_); + auto p_fmat = GetDMatrixFromData(x, n_samples, n_features_); + std::vector w = GenerateRandomWeights(n_samples); + + auto hessian = this->GenerateHessian(ctx, n_samples); + + this->CheckReg(ctx, p_fmat, n_bins, hessian, w, n_elements); + } +}; + +auto MakeParamsForTest() { + std::vector sizes = {1, 2, 256, 512, 1000, 1500}; + std::vector bin_sizes = {2, 16, 256, 512}; + std::vector> configs; + for (auto n_samples : sizes) { + for (auto n_bins : bin_sizes) { + configs.emplace_back(true, n_samples, n_bins); + configs.emplace_back(false, n_samples, n_bins); + } + } + return configs; +} +} // namespace + +TEST_P(DeviceSketchWithHessianTest, DeviceSketchWithHessian) { + auto param = GetParam(); + auto n_samples = std::get<1>(param); + auto n_bins = std::get<2>(param); + if (std::get<0>(param)) { + this->TestLTR(&ctx_, n_samples, n_bins, 0); + this->TestLTR(&ctx_, n_samples, n_bins, 512); + } else { + this->TestRegression(&ctx_, n_samples, n_bins, 0); + this->TestRegression(&ctx_, n_samples, n_bins, 512); + } +} + +INSTANTIATE_TEST_SUITE_P( + HistUtil, DeviceSketchWithHessianTest, ::testing::ValuesIn(MakeParamsForTest()), + [](::testing::TestParamInfo const& info) { + auto task = std::get<0>(info.param) ? 
"ltr" : "reg"; + auto n_samples = std::to_string(std::get<1>(info.param)); + auto n_bins = std::to_string(std::get<2>(info.param)); + return std::string{task} + "_" + n_samples + "_" + n_bins; + }); +} // namespace xgboost::common diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index d2dc802a9..eda55ee47 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -1,9 +1,14 @@ +/** + * Copyright 2020-2023, XGBoost contributors + */ #include -#include "test_quantile.h" -#include "../helpers.h" + #include "../../../src/collective/communicator-inl.cuh" #include "../../../src/common/hist_util.cuh" #include "../../../src/common/quantile.cuh" +#include "../../../src/data/device_adapter.cuh" // CupyAdapter +#include "../helpers.h" +#include "test_quantile.h" namespace xgboost { namespace { @@ -437,13 +442,13 @@ void TestColumnSplitBasic() { }()}; // Generate cuts for distributed environment. - auto const device = rank; - HistogramCuts distributed_cuts = common::DeviceSketch(device, m.get(), kBins); + auto ctx = MakeCUDACtx(rank); + HistogramCuts distributed_cuts = common::DeviceSketch(&ctx, m.get(), kBins); // Generate cuts for single node environment collective::Finalize(); CHECK_EQ(collective::GetWorldSize(), 1); - HistogramCuts single_node_cuts = common::DeviceSketch(device, m.get(), kBins); + HistogramCuts single_node_cuts = common::DeviceSketch(&ctx, m.get(), kBins); auto const& sptrs = single_node_cuts.Ptrs(); auto const& dptrs = distributed_cuts.Ptrs(); From 1b657a55138f60dbf46ec9e5c80c05aae3cf2c4d Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Mon, 24 Jul 2023 18:32:25 +0800 Subject: [PATCH 051/136] [jvm-packages] set device to cuda when tree method is "gpu_hist" (#9412) --- .../scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala index 5fc16ec09..f514eaa68 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala @@ -180,10 +180,12 @@ private[this] class XGBoostExecutionParamsFactory(rawParams: Map[String, Any], s " as 'hist', 'approx', 'gpu_hist', and 'auto'") treeMethod = Some(overridedParams("tree_method").asInstanceOf[String]) } - val device: Option[String] = overridedParams.get("device") match { - case None => None - case Some(dev: String) => if (treeMethod == "gpu_hist") Some("cuda") else Some(dev) - } + + // back-compatible with "gpu_hist" + val device: Option[String] = if (treeMethod.exists(_ == "gpu_hist")) { + Some("cuda") + } else overridedParams.get("device").map(_.toString) + if (overridedParams.contains("train_test_ratio")) { logger.warn("train_test_ratio is deprecated since XGBoost 0.82, we recommend to explicitly" + " pass a training and multiple evaluation datasets by passing 'eval_sets' and " + From 3a9996173e5209dff0e71ea8b2e85c954339ab59 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 25 Jul 2023 03:03:28 +0800 Subject: [PATCH 052/136] Revert "Switch to per-thread default stream (#9396)" (#9413) This reverts commit f7f673b00c15458fb4dd74a2a0d2ba80369c5faf. 
---
 cmake/Utils.cmake                             |  1 -
 src/collective/nccl_device_communicator.cu    | 27 ++++++++++++-------
 src/collective/nccl_device_communicator.cuh   |  1 +
 src/common/device_helpers.cuh                 |  2 +-
 src/common/hist_util.cuh                      |  4 +--
 src/tree/gpu_hist/row_partitioner.cu          |  2 ++
 src/tree/gpu_hist/row_partitioner.cuh         | 21 ++++++++-------
 .../cpp/tree/gpu_hist/test_row_partitioner.cu |  2 +-
 8 files changed, 35 insertions(+), 25 deletions(-)

diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake
index 1e0530efa..cb239f79c 100644
--- a/cmake/Utils.cmake
+++ b/cmake/Utils.cmake
@@ -127,7 +127,6 @@ endfunction(format_gencode_flags flags)
 # Set CUDA related flags to target. Must be used after code `format_gencode_flags`.
 function(xgboost_set_cuda_flags target)
   target_compile_options(${target} PRIVATE
-    $<$<COMPILE_LANGUAGE:CUDA>:--default-stream per-thread>
     $<$<COMPILE_LANGUAGE:CUDA>:--expt-extended-lambda>
     $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
     $<$<COMPILE_LANGUAGE:CUDA>:${GEN_CODE}>
diff --git a/src/collective/nccl_device_communicator.cu b/src/collective/nccl_device_communicator.cu
index 51fa5693c..470700d2d 100644
--- a/src/collective/nccl_device_communicator.cu
+++ b/src/collective/nccl_device_communicator.cu
@@ -44,12 +44,16 @@ NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sy
   nccl_unique_id_ = GetUniqueId();
   dh::safe_cuda(cudaSetDevice(device_ordinal_));
   dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_));
+  dh::safe_cuda(cudaStreamCreate(&cuda_stream_));
 }
 
 NcclDeviceCommunicator::~NcclDeviceCommunicator() {
   if (world_size_ == 1) {
     return;
   }
+  if (cuda_stream_) {
+    dh::safe_cuda(cudaStreamDestroy(cuda_stream_));
+  }
   if (nccl_comm_) {
     dh::safe_nccl(ncclCommDestroy(nccl_comm_));
   }
@@ -119,8 +123,8 @@ ncclRedOp_t GetNcclRedOp(Operation const &op) {
 
 template <typename Func>
 void RunBitwiseAllreduce(char *out_buffer, char const *device_buffer, Func func, int world_size,
-                         std::size_t size) {
-  dh::LaunchN(size, [=] __device__(std::size_t idx) {
+                         std::size_t size, cudaStream_t stream) {
+  dh::LaunchN(size, stream, [=] __device__(std::size_t idx) {
     auto result = device_buffer[idx];
     for (auto rank = 1; rank < world_size; rank++) {
       result = func(result, device_buffer[rank * size + idx]);
@@ -138,22 +142,25 @@ void NcclDeviceCommunicator::BitwiseAllReduce(void *send_receive_buffer, std::si
 
   // First gather data from all the workers.
   dh::safe_nccl(ncclAllGather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type),
-                              nccl_comm_, dh::DefaultStream()));
+                              nccl_comm_, cuda_stream_));
   if (needs_sync_) {
-    dh::DefaultStream().Sync();
+    dh::safe_cuda(cudaStreamSynchronize(cuda_stream_));
   }
 
   // Then reduce locally.
   auto *out_buffer = static_cast<char *>(send_receive_buffer);
   switch (op) {
     case Operation::kBitwiseAND:
-      RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_and<char>(), world_size_, size);
+      RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_and<char>(), world_size_, size,
+                          cuda_stream_);
       break;
     case Operation::kBitwiseOR:
-      RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_or<char>(), world_size_, size);
+      RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_or<char>(), world_size_, size,
+                          cuda_stream_);
       break;
     case Operation::kBitwiseXOR:
-      RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_xor<char>(), world_size_, size);
+      RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_xor<char>(), world_size_, size,
+                          cuda_stream_);
       break;
     default:
       LOG(FATAL) << "Not a bitwise reduce operation.";
@@ -172,7 +179,7 @@ void NcclDeviceCommunicator::AllReduce(void *send_receive_buffer, std::size_t co
   } else {
     dh::safe_nccl(ncclAllReduce(send_receive_buffer, send_receive_buffer, count,
                                 GetNcclDataType(data_type), GetNcclRedOp(op), nccl_comm_,
-                                dh::DefaultStream()));
+                                cuda_stream_));
   }
   allreduce_bytes_ += count * GetTypeSize(data_type);
   allreduce_calls_ += 1;
@@ -199,7 +206,7 @@ void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_b
   for (int32_t i = 0; i < world_size_; ++i) {
     size_t as_bytes = segments->at(i);
     dh::safe_nccl(ncclBroadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes,
-                                ncclChar, i, nccl_comm_, dh::DefaultStream()));
+                                ncclChar, i, nccl_comm_, cuda_stream_));
     offset += as_bytes;
   }
   dh::safe_nccl(ncclGroupEnd());
@@ -210,7 +217,7 @@ void NcclDeviceCommunicator::Synchronize() {
     return;
   }
   dh::safe_cuda(cudaSetDevice(device_ordinal_));
-  dh::DefaultStream().Sync();
+  dh::safe_cuda(cudaStreamSynchronize(cuda_stream_));
 }
 
 }  // namespace collective
diff --git a/src/collective/nccl_device_communicator.cuh b/src/collective/nccl_device_communicator.cuh
index d99002685..bb3fce45c 100644
--- a/src/collective/nccl_device_communicator.cuh
+++ b/src/collective/nccl_device_communicator.cuh
@@ -77,6 +77,7 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
   int const world_size_;
   int const rank_;
   ncclComm_t nccl_comm_{};
+  cudaStream_t cuda_stream_{};
   ncclUniqueId nccl_unique_id_{};
   size_t allreduce_bytes_{0};  // Keep statistics of the number of bytes communicated.
   size_t allreduce_calls_{0};  // Keep statistics of the number of reduce calls.
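The hunks that follow continue the same pattern as the communicator changes above: device helpers such as SortPositionBatch regain an explicit cudaStream_t argument instead of implicitly running on the default stream. A hedged sketch of that pattern; ForEachKernel and LaunchOn are invented names, not part of the patch, and dh::LaunchN in the diff plays the LaunchOn role here:

    #include <cstddef>

    #include <cuda_runtime.h>

    template <typename Func>
    __global__ void ForEachKernel(std::size_t n, Func fn) {
      std::size_t i = static_cast<std::size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
      if (i < n) {
        fn(i);  // apply the functor to one element
      }
    }

    // The caller always names the stream, so kernel ordering is controlled by
    // the code rather than by how the default stream happens to be compiled.
    template <typename Func>
    void LaunchOn(cudaStream_t stream, std::size_t n, Func fn) {
      unsigned const kBlock = 256;
      auto grid = static_cast<unsigned>((n + kBlock - 1) / kBlock);
      ForEachKernel<<<grid, kBlock, 0, stream>>>(n, fn);
    }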
diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index c45949f66..db38b2222 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -1176,7 +1176,7 @@ inline void CUDAEvent::Record(CUDAStreamView stream) { // NOLINT dh::safe_cuda(cudaEventRecord(event_, cudaStream_t{stream})); } -inline CUDAStreamView DefaultStream() { return CUDAStreamView{cudaStreamPerThread}; } +inline CUDAStreamView DefaultStream() { return CUDAStreamView{cudaStreamLegacy}; } class CUDAStream { cudaStream_t stream_; diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh index d7be12749..f13f01b3e 100644 --- a/src/common/hist_util.cuh +++ b/src/common/hist_util.cuh @@ -134,12 +134,12 @@ void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan batch_iter CHECK(!force_use_u64); auto kernel = GetColumnSizeSharedMemKernel; auto grid_size = EstimateGridSize(device, kernel, required_shared_memory); - dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory}( + dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory, dh::DefaultStream()}( kernel, batch_iter, is_valid, out_column_size); } else { auto kernel = GetColumnSizeSharedMemKernel; auto grid_size = EstimateGridSize(device, kernel, required_shared_memory); - dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory}( + dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory, dh::DefaultStream()}( kernel, batch_iter, is_valid, out_column_size); } } else { diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 78b04883c..015d817f3 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -18,10 +18,12 @@ RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) dh::safe_cuda(cudaSetDevice(device_idx_)); ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)}); thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); + dh::safe_cuda(cudaStreamCreate(&stream_)); } RowPartitioner::~RowPartitioner() { dh::safe_cuda(cudaSetDevice(device_idx_)); + dh::safe_cuda(cudaStreamDestroy(stream_)); } common::Span RowPartitioner::GetRows(bst_node_t nidx) { diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 215a0e49b..f1c420ba0 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -116,7 +116,7 @@ template void SortPositionBatch(common::Span> d_batch_info, common::Span ridx, common::Span ridx_tmp, common::Span d_counts, std::size_t total_rows, OpT op, - dh::device_vector* tmp) { + dh::device_vector* tmp, cudaStream_t stream) { dh::LDGIterator> batch_info_itr(d_batch_info.data()); WriteResultsFunctor write_results{batch_info_itr, ridx.data(), ridx_tmp.data(), d_counts.data()}; @@ -135,12 +135,12 @@ void SortPositionBatch(common::Span> d_batch_info, size_t temp_bytes = 0; if (tmp->empty()) { cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator, - IndexFlagOp(), total_rows); + IndexFlagOp(), total_rows, stream); tmp->resize(temp_bytes); } temp_bytes = tmp->size(); cub::DeviceScan::InclusiveScan(tmp->data().get(), temp_bytes, input_iterator, - discard_write_iterator, IndexFlagOp(), total_rows); + discard_write_iterator, IndexFlagOp(), total_rows, stream); constexpr int kBlockSize = 256; @@ -149,7 +149,7 @@ void SortPositionBatch(common::Span> d_batch_info, const int grid_size = xgboost::common::DivRoundUp(total_rows, kBlockSize * kItemsThread); 
SortPositionCopyKernel - <<>>(batch_info_itr, ridx, ridx_tmp, total_rows); + <<>>(batch_info_itr, ridx, ridx_tmp, total_rows); } struct NodePositionInfo { @@ -221,6 +221,7 @@ class RowPartitioner { dh::device_vector tmp_; dh::PinnedMemory pinned_; dh::PinnedMemory pinned2_; + cudaStream_t stream_; public: RowPartitioner(int device_idx, size_t num_rows); @@ -277,7 +278,7 @@ class RowPartitioner { } dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(), h_batch_info.size() * sizeof(PerNodeData), - cudaMemcpyDefault)); + cudaMemcpyDefault, stream_)); // Temporary arrays auto h_counts = pinned_.GetSpan(nidx.size(), 0); @@ -286,12 +287,12 @@ class RowPartitioner { // Partition the rows according to the operator SortPositionBatch( dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts), - total_rows, op, &tmp_); + total_rows, op, &tmp_, stream_); dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(), - cudaMemcpyDefault)); + cudaMemcpyDefault, stream_)); // TODO(Rory): this synchronisation hurts performance a lot // Future optimisation should find a way to skip this - dh::DefaultStream().Sync(); + dh::safe_cuda(cudaStreamSynchronize(stream_)); // Update segments for (size_t i = 0; i < nidx.size(); i++) { @@ -326,13 +327,13 @@ class RowPartitioner { dh::TemporaryArray d_node_info_storage(ridx_segments_.size()); dh::safe_cuda(cudaMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(), sizeof(NodePositionInfo) * ridx_segments_.size(), - cudaMemcpyDefault)); + cudaMemcpyDefault, stream_)); constexpr int kBlockSize = 512; const int kItemsThread = 8; const int grid_size = xgboost::common::DivRoundUp(ridx_.size(), kBlockSize * kItemsThread); common::Span d_ridx(ridx_.data().get(), ridx_.size()); - FinalisePositionKernel<<>>( + FinalisePositionKernel<<>>( dh::ToSpan(d_node_info_storage), d_ridx, d_out_position, op); } }; diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 050980400..f82123452 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -73,7 +73,7 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector tmp; SortPositionBatch(dh::ToSpan(d_batch_info), dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(counts), - total_rows, op, &tmp); + total_rows, op, &tmp, nullptr); auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; }; for (size_t i = 0; i < segments.size(); i++) { From 54579da4d7f23f9620ad562b640b82ac429b5a2e Mon Sep 17 00:00:00 2001 From: Nicholas Hilton <32165552+NickHilton@users.noreply.github.com> Date: Tue, 25 Jul 2023 17:03:04 -0600 Subject: [PATCH 053/136] [doc] Fix typo in prediction.rst (#9415) Typo for `pred_contribs` and `pred_interactions` --- doc/prediction.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/prediction.rst b/doc/prediction.rst index b98c2fc6b..026009d59 100644 --- a/doc/prediction.rst +++ b/doc/prediction.rst @@ -35,14 +35,14 @@ After 1.4 release, we added a new parameter called ``strict_shape``, one can set has equivalent output shape of ``multi:softprob`` due to dropped transformation. If strict shape is set to False then output can have 1 or 2 dim depending on used model. 
-- When using ``preds_contribs`` with ``strict_shape`` set to ``True``: +- When using ``pred_contribs`` with ``strict_shape`` set to ``True``: Output is a 3-dim array, with ``(rows, groups, columns + 1)`` as shape. Whether ``approx_contribs`` is used does not change the output shape. If the strict shape parameter is not set, it can be a 2 or 3 dimension array depending on whether multi-class model is being used. -- When using ``preds_interactions`` with ``strict_shape`` set to ``True``: +- When using ``pred_interactions`` with ``strict_shape`` set to ``True``: Output is a 4-dim array, with ``(rows, groups, columns + 1, columns + 1)`` as shape. Like the predict contribution case, whether ``approx_contribs`` is used does not change From 7579905e187fbbc2cec48299f7ea36a0365ba4d8 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Tue, 25 Jul 2023 16:09:12 -0700 Subject: [PATCH 054/136] Retry switching to per-thread default stream (#9416) --- CMakeLists.txt | 1 + cmake/Utils.cmake | 5 ++++ src/collective/nccl_device_communicator.cu | 27 +++++++------------ src/collective/nccl_device_communicator.cuh | 1 - src/common/device_helpers.cuh | 10 +++++-- src/common/hist_util.cuh | 4 +-- src/tree/gpu_hist/row_partitioner.cu | 2 -- src/tree/gpu_hist/row_partitioner.cuh | 21 +++++++-------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 2 +- 9 files changed, 37 insertions(+), 36 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ede6c5b75..a026888df 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,6 +50,7 @@ option(HIDE_CXX_SYMBOLS "Build shared library and hide all C++ symbols" OFF) option(KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR "Output build artifacts in CMake binary dir" OFF) ## CUDA option(USE_CUDA "Build with GPU acceleration" OFF) +option(USE_PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" ON) option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF) option(BUILD_WITH_SHARED_NCCL "Build with shared NCCL library." 
OFF) set(GPU_COMPUTE_VER "" CACHE STRING diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index cb239f79c..98e96e304 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -133,6 +133,11 @@ function(xgboost_set_cuda_flags target) $<$:-Xcompiler=${OpenMP_CXX_FLAGS}> $<$:-Xfatbin=-compress-all>) + if (USE_PER_THREAD_DEFAULT_STREAM) + target_compile_options(${target} PRIVATE + $<$:--default-stream per-thread>) + endif (USE_PER_THREAD_DEFAULT_STREAM) + if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") set_property(TARGET ${target} PROPERTY CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}) endif (CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") diff --git a/src/collective/nccl_device_communicator.cu b/src/collective/nccl_device_communicator.cu index 470700d2d..51fa5693c 100644 --- a/src/collective/nccl_device_communicator.cu +++ b/src/collective/nccl_device_communicator.cu @@ -44,16 +44,12 @@ NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sy nccl_unique_id_ = GetUniqueId(); dh::safe_cuda(cudaSetDevice(device_ordinal_)); dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_)); - dh::safe_cuda(cudaStreamCreate(&cuda_stream_)); } NcclDeviceCommunicator::~NcclDeviceCommunicator() { if (world_size_ == 1) { return; } - if (cuda_stream_) { - dh::safe_cuda(cudaStreamDestroy(cuda_stream_)); - } if (nccl_comm_) { dh::safe_nccl(ncclCommDestroy(nccl_comm_)); } @@ -123,8 +119,8 @@ ncclRedOp_t GetNcclRedOp(Operation const &op) { template void RunBitwiseAllreduce(char *out_buffer, char const *device_buffer, Func func, int world_size, - std::size_t size, cudaStream_t stream) { - dh::LaunchN(size, stream, [=] __device__(std::size_t idx) { + std::size_t size) { + dh::LaunchN(size, [=] __device__(std::size_t idx) { auto result = device_buffer[idx]; for (auto rank = 1; rank < world_size; rank++) { result = func(result, device_buffer[rank * size + idx]); @@ -142,25 +138,22 @@ void NcclDeviceCommunicator::BitwiseAllReduce(void *send_receive_buffer, std::si // First gather data from all the workers. dh::safe_nccl(ncclAllGather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type), - nccl_comm_, cuda_stream_)); + nccl_comm_, dh::DefaultStream())); if (needs_sync_) { - dh::safe_cuda(cudaStreamSynchronize(cuda_stream_)); + dh::DefaultStream().Sync(); } // Then reduce locally. 
auto *out_buffer = static_cast(send_receive_buffer); switch (op) { case Operation::kBitwiseAND: - RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_and(), world_size_, size, - cuda_stream_); + RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_and(), world_size_, size); break; case Operation::kBitwiseOR: - RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_or(), world_size_, size, - cuda_stream_); + RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_or(), world_size_, size); break; case Operation::kBitwiseXOR: - RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_xor(), world_size_, size, - cuda_stream_); + RunBitwiseAllreduce(out_buffer, device_buffer, thrust::bit_xor(), world_size_, size); break; default: LOG(FATAL) << "Not a bitwise reduce operation."; @@ -179,7 +172,7 @@ void NcclDeviceCommunicator::AllReduce(void *send_receive_buffer, std::size_t co } else { dh::safe_nccl(ncclAllReduce(send_receive_buffer, send_receive_buffer, count, GetNcclDataType(data_type), GetNcclRedOp(op), nccl_comm_, - cuda_stream_)); + dh::DefaultStream())); } allreduce_bytes_ += count * GetTypeSize(data_type); allreduce_calls_ += 1; @@ -206,7 +199,7 @@ void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_b for (int32_t i = 0; i < world_size_; ++i) { size_t as_bytes = segments->at(i); dh::safe_nccl(ncclBroadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes, - ncclChar, i, nccl_comm_, cuda_stream_)); + ncclChar, i, nccl_comm_, dh::DefaultStream())); offset += as_bytes; } dh::safe_nccl(ncclGroupEnd()); @@ -217,7 +210,7 @@ void NcclDeviceCommunicator::Synchronize() { return; } dh::safe_cuda(cudaSetDevice(device_ordinal_)); - dh::safe_cuda(cudaStreamSynchronize(cuda_stream_)); + dh::DefaultStream().Sync(); } } // namespace collective diff --git a/src/collective/nccl_device_communicator.cuh b/src/collective/nccl_device_communicator.cuh index bb3fce45c..d99002685 100644 --- a/src/collective/nccl_device_communicator.cuh +++ b/src/collective/nccl_device_communicator.cuh @@ -77,7 +77,6 @@ class NcclDeviceCommunicator : public DeviceCommunicator { int const world_size_; int const rank_; ncclComm_t nccl_comm_{}; - cudaStream_t cuda_stream_{}; ncclUniqueId nccl_unique_id_{}; size_t allreduce_bytes_{0}; // Keep statistics of the number of bytes communicated. size_t allreduce_calls_{0}; // Keep statistics of the number of reduce calls. 
diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index db38b2222..dfaac9c35 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -480,7 +480,7 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator { cub::CachingDeviceAllocator& GetGlobalCachingAllocator() { // Configure allocator with maximum cached bin size of ~1GB and no limit on // maximum cached bytes - static cub::CachingDeviceAllocator *allocator = new cub::CachingDeviceAllocator(2, 9, 29); + thread_local cub::CachingDeviceAllocator *allocator = new cub::CachingDeviceAllocator(2, 9, 29); return *allocator; } pointer allocate(size_t n) { // NOLINT @@ -1176,7 +1176,13 @@ inline void CUDAEvent::Record(CUDAStreamView stream) { // NOLINT dh::safe_cuda(cudaEventRecord(event_, cudaStream_t{stream})); } -inline CUDAStreamView DefaultStream() { return CUDAStreamView{cudaStreamLegacy}; } +inline CUDAStreamView DefaultStream() { +#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM + return CUDAStreamView{cudaStreamPerThread}; +#else + return CUDAStreamView{cudaStreamLegacy}; +#endif +} class CUDAStream { cudaStream_t stream_; diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh index f13f01b3e..d7be12749 100644 --- a/src/common/hist_util.cuh +++ b/src/common/hist_util.cuh @@ -134,12 +134,12 @@ void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan batch_iter CHECK(!force_use_u64); auto kernel = GetColumnSizeSharedMemKernel; auto grid_size = EstimateGridSize(device, kernel, required_shared_memory); - dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory, dh::DefaultStream()}( + dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory}( kernel, batch_iter, is_valid, out_column_size); } else { auto kernel = GetColumnSizeSharedMemKernel; auto grid_size = EstimateGridSize(device, kernel, required_shared_memory); - dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory, dh::DefaultStream()}( + dh::LaunchKernel{grid_size, kBlockThreads, required_shared_memory}( kernel, batch_iter, is_valid, out_column_size); } } else { diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 015d817f3..78b04883c 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -18,12 +18,10 @@ RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) dh::safe_cuda(cudaSetDevice(device_idx_)); ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)}); thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); - dh::safe_cuda(cudaStreamCreate(&stream_)); } RowPartitioner::~RowPartitioner() { dh::safe_cuda(cudaSetDevice(device_idx_)); - dh::safe_cuda(cudaStreamDestroy(stream_)); } common::Span RowPartitioner::GetRows(bst_node_t nidx) { diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index f1c420ba0..215a0e49b 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -116,7 +116,7 @@ template void SortPositionBatch(common::Span> d_batch_info, common::Span ridx, common::Span ridx_tmp, common::Span d_counts, std::size_t total_rows, OpT op, - dh::device_vector* tmp, cudaStream_t stream) { + dh::device_vector* tmp) { dh::LDGIterator> batch_info_itr(d_batch_info.data()); WriteResultsFunctor write_results{batch_info_itr, ridx.data(), ridx_tmp.data(), d_counts.data()}; @@ -135,12 +135,12 @@ void SortPositionBatch(common::Span> d_batch_info, size_t temp_bytes = 0; if (tmp->empty()) { 
cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator, - IndexFlagOp(), total_rows, stream); + IndexFlagOp(), total_rows); tmp->resize(temp_bytes); } temp_bytes = tmp->size(); cub::DeviceScan::InclusiveScan(tmp->data().get(), temp_bytes, input_iterator, - discard_write_iterator, IndexFlagOp(), total_rows, stream); + discard_write_iterator, IndexFlagOp(), total_rows); constexpr int kBlockSize = 256; @@ -149,7 +149,7 @@ void SortPositionBatch(common::Span> d_batch_info, const int grid_size = xgboost::common::DivRoundUp(total_rows, kBlockSize * kItemsThread); SortPositionCopyKernel - <<>>(batch_info_itr, ridx, ridx_tmp, total_rows); + <<>>(batch_info_itr, ridx, ridx_tmp, total_rows); } struct NodePositionInfo { @@ -221,7 +221,6 @@ class RowPartitioner { dh::device_vector tmp_; dh::PinnedMemory pinned_; dh::PinnedMemory pinned2_; - cudaStream_t stream_; public: RowPartitioner(int device_idx, size_t num_rows); @@ -278,7 +277,7 @@ class RowPartitioner { } dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(), h_batch_info.size() * sizeof(PerNodeData), - cudaMemcpyDefault, stream_)); + cudaMemcpyDefault)); // Temporary arrays auto h_counts = pinned_.GetSpan(nidx.size(), 0); @@ -287,12 +286,12 @@ class RowPartitioner { // Partition the rows according to the operator SortPositionBatch( dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts), - total_rows, op, &tmp_, stream_); + total_rows, op, &tmp_); dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(), - cudaMemcpyDefault, stream_)); + cudaMemcpyDefault)); // TODO(Rory): this synchronisation hurts performance a lot // Future optimisation should find a way to skip this - dh::safe_cuda(cudaStreamSynchronize(stream_)); + dh::DefaultStream().Sync(); // Update segments for (size_t i = 0; i < nidx.size(); i++) { @@ -327,13 +326,13 @@ class RowPartitioner { dh::TemporaryArray d_node_info_storage(ridx_segments_.size()); dh::safe_cuda(cudaMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(), sizeof(NodePositionInfo) * ridx_segments_.size(), - cudaMemcpyDefault, stream_)); + cudaMemcpyDefault)); constexpr int kBlockSize = 512; const int kItemsThread = 8; const int grid_size = xgboost::common::DivRoundUp(ridx_.size(), kBlockSize * kItemsThread); common::Span d_ridx(ridx_.data().get(), ridx_.size()); - FinalisePositionKernel<<>>( + FinalisePositionKernel<<>>( dh::ToSpan(d_node_info_storage), d_ridx, d_out_position, op); } }; diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index f82123452..050980400 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -73,7 +73,7 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector tmp; SortPositionBatch(dh::ToSpan(d_batch_info), dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(counts), - total_rows, op, &tmp, nullptr); + total_rows, op, &tmp); auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; }; for (size_t i = 0; i < segments.size(); i++) { From 8f0efb4ab3ab21e527ba1f32c03cac7cd1ac8bd2 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Thu, 27 Jul 2023 11:09:55 +0800 Subject: [PATCH 055/136] [jvm-packages] automatically set the max/min direction for best score (#9404) --- .../dmlc/xgboost4j/scala/spark/XGBoost.scala | 23 +--- .../spark/params/LearningTaskParams.scala | 4 - 
.../java/ml/dmlc/xgboost4j/java/XGBoost.java | 88 ++++++++++--- .../ml/dmlc/xgboost4j/java/XGBoostTest.java | 121 ++++++++++++++++++ 4 files changed, 194 insertions(+), 42 deletions(-) create mode 100644 jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/XGBoostTest.java diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala index f514eaa68..2f1f261fb 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala @@ -23,7 +23,6 @@ import scala.util.Random import scala.collection.JavaConverters._ import ml.dmlc.xgboost4j.java.{Communicator, IRabitTracker, XGBoostError, RabitTracker => PyRabitTracker} -import ml.dmlc.xgboost4j.scala.spark.params.LearningTaskParams import ml.dmlc.xgboost4j.scala.ExternalCheckpointManager import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost, _} import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint} @@ -55,9 +54,6 @@ object TrackerConf { def apply(): TrackerConf = TrackerConf(0L) } -private[scala] case class XGBoostExecutionEarlyStoppingParams(numEarlyStoppingRounds: Int, - maximizeEvalMetrics: Boolean) - private[scala] case class XGBoostExecutionInputParams(trainTestRatio: Double, seed: Long) private[scala] case class XGBoostExecutionParams( @@ -71,7 +67,7 @@ private[scala] case class XGBoostExecutionParams( trackerConf: TrackerConf, checkpointParam: Option[ExternalCheckpointParams], xgbInputParams: XGBoostExecutionInputParams, - earlyStoppingParams: XGBoostExecutionEarlyStoppingParams, + earlyStoppingRounds: Int, cacheTrainingSet: Boolean, device: Option[String], isLocal: Boolean, @@ -146,15 +142,8 @@ private[this] class XGBoostExecutionParamsFactory(rawParams: Map[String, Any], s val numEarlyStoppingRounds = overridedParams.getOrElse( "num_early_stopping_rounds", 0).asInstanceOf[Int] overridedParams += "num_early_stopping_rounds" -> numEarlyStoppingRounds - if (numEarlyStoppingRounds > 0 && - !overridedParams.contains("maximize_evaluation_metrics")) { - if (overridedParams.getOrElse("custom_eval", null) != null) { + if (numEarlyStoppingRounds > 0 && overridedParams.getOrElse("custom_eval", null) != null) { throw new IllegalArgumentException("custom_eval does not support early stopping") - } - val eval_metric = overridedParams("eval_metric").toString - val maximize = LearningTaskParams.evalMetricsToMaximize contains eval_metric - logger.info("parameter \"maximize_evaluation_metrics\" is set to " + maximize) - overridedParams += ("maximize_evaluation_metrics" -> maximize) } overridedParams } @@ -213,10 +202,6 @@ private[this] class XGBoostExecutionParamsFactory(rawParams: Map[String, Any], s val earlyStoppingRounds = overridedParams.getOrElse( "num_early_stopping_rounds", 0).asInstanceOf[Int] - val maximizeEvalMetrics = overridedParams.getOrElse( - "maximize_evaluation_metrics", true).asInstanceOf[Boolean] - val xgbExecEarlyStoppingParams = XGBoostExecutionEarlyStoppingParams(earlyStoppingRounds, - maximizeEvalMetrics) val cacheTrainingSet = overridedParams.getOrElse("cache_training_set", false) .asInstanceOf[Boolean] @@ -232,7 +217,7 @@ private[this] class XGBoostExecutionParamsFactory(rawParams: Map[String, Any], s missing, allowNonZeroForMissing, trackerConf, checkpointParam, inputParams, - xgbExecEarlyStoppingParams, + earlyStoppingRounds, cacheTrainingSet, device, isLocal, @@ 
-319,7 +304,7 @@ object XGBoost extends Serializable { watches = buildWatchesAndCheck(buildWatches) - val numEarlyStoppingRounds = xgbExecutionParam.earlyStoppingParams.numEarlyStoppingRounds + val numEarlyStoppingRounds = xgbExecutionParam.earlyStoppingRounds val metrics = Array.tabulate(watches.size)(_ => Array.ofDim[Float](numRounds)) val externalCheckpointParams = xgbExecutionParam.checkpointParam diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/LearningTaskParams.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/LearningTaskParams.scala index 6aec4d36e..bcbd7548f 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/LearningTaskParams.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/LearningTaskParams.scala @@ -112,8 +112,4 @@ private[spark] object LearningTaskParams { val supportedObjectiveType = HashSet("regression", "classification") - val evalMetricsToMaximize = HashSet("auc", "aucpr", "ndcg", "map") - - val evalMetricsToMinimize = HashSet("rmse", "rmsle", "mae", "mape", "logloss", "error", "merror", - "mlogloss", "gamma-deviance") } diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoost.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoost.java index d765a3cab..bcd0b1b11 100644 --- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoost.java +++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoost.java @@ -17,6 +17,8 @@ package ml.dmlc.xgboost4j.java; import java.io.*; import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -30,6 +32,11 @@ import org.apache.hadoop.fs.FileSystem; public class XGBoost { private static final Log logger = LogFactory.getLog(XGBoost.class); + public static final String[] MAXIMIZ_METRICES = { + "auc", "aucpr", "pre", "pre@", "map", "ndcg", + "auc@", "aucpr@", "map@", "ndcg@", + }; + /** * load model from modelPath * @@ -158,7 +165,7 @@ public class XGBoost { //collect eval matrixs String[] evalNames; DMatrix[] evalMats; - float bestScore; + float bestScore = 1; int bestIteration; List names = new ArrayList(); List mats = new ArrayList(); @@ -175,11 +182,7 @@ public class XGBoost { evalNames = names.toArray(new String[names.size()]); evalMats = mats.toArray(new DMatrix[mats.size()]); - if (isMaximizeEvaluation(params)) { - bestScore = -Float.MAX_VALUE; - } else { - bestScore = Float.MAX_VALUE; - } + bestIteration = 0; metrics = metrics == null ? 
new float[evalNames.length][numRounds] : metrics; @@ -210,6 +213,9 @@ public class XGBoost { checkpointIterations = new HashSet<>(ecm.getCheckpointRounds(checkpointInterval, numRounds)); } + boolean initial_best_score_flag = false; + boolean max_direction = false; + // begin to train for (int iter = booster.getVersion() / 2; iter < numRounds; iter++) { if (booster.getVersion() % 2 == 0) { @@ -231,6 +237,18 @@ public class XGBoost { } else { evalInfo = booster.evalSet(evalMats, evalNames, iter, metricsOut); } + + if (!initial_best_score_flag) { + if (isMaximizeEvaluation(evalInfo, evalNames, params)) { + max_direction = true; + bestScore = -Float.MAX_VALUE; + } else { + max_direction = false; + bestScore = Float.MAX_VALUE; + } + initial_best_score_flag = true; + } + for (int i = 0; i < metricsOut.length; i++) { metrics[i][iter] = metricsOut[i]; } @@ -238,7 +256,7 @@ public class XGBoost { // If there is more than one evaluation datasets, the last one would be used // to determinate early stop. float score = metricsOut[metricsOut.length - 1]; - if (isMaximizeEvaluation(params)) { + if (max_direction) { // Update best score if the current score is better (no update when equal) if (score > bestScore) { bestScore = score; @@ -264,9 +282,7 @@ public class XGBoost { break; } if (Communicator.getRank() == 0 && shouldPrint(params, iter)) { - if (shouldPrint(params, iter)){ - Communicator.communicatorPrint(evalInfo + '\n'); - } + Communicator.communicatorPrint(evalInfo + '\n'); } } booster.saveRabitCheckpoint(); @@ -360,16 +376,50 @@ public class XGBoost { return iter - bestIteration >= earlyStoppingRounds; } - private static boolean isMaximizeEvaluation(Map params) { - try { - String maximize = String.valueOf(params.get("maximize_evaluation_metrics")); - assert(maximize != null); - return Boolean.valueOf(maximize); - } catch (Exception ex) { - logger.error("maximize_evaluation_metrics has to be specified for enabling early stop," + - " allowed value: true/false", ex); - throw ex; + private static String getMetricNameFromlog(String evalInfo, String[] evalNames) { + String regexPattern = Pattern.quote(evalNames[0]) + "-(.*):"; + Pattern pattern = Pattern.compile(regexPattern); + Matcher matcher = pattern.matcher(evalInfo); + + String metricName = null; + if (matcher.find()) { + metricName = matcher.group(1); + logger.debug("Got the metric name: " + metricName); } + return metricName; + } + + // visiable for testing + public static boolean isMaximizeEvaluation(String evalInfo, + String[] evalNames, + Map params) { + + String metricName; + + if (params.get("maximize_evaluation_metrics") != null) { + // user has forced the direction no matter what is the metric name. 
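+      // Resolution order: an explicit maximize_evaluation_metrics always wins; otherwise
+      // a user-supplied eval_metric is consulted, and only as a last resort is the
+      // metric name parsed out of the evaluation log below.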
+ String maximize = String.valueOf(params.get("maximize_evaluation_metrics")); + return Boolean.valueOf(maximize); + } + + if (params.get("eval_metric") != null) { + // user has special metric name + metricName = String.valueOf(params.get("eval_metric")); + } else { + // infer the metric name from log + metricName = getMetricNameFromlog(evalInfo, evalNames); + } + + assert metricName != null; + + if (!"mape".equals(metricName)) { + for (String x : MAXIMIZ_METRICES) { + if (metricName.startsWith(x)) { + return true; + } + } + } + return false; } /** diff --git a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/XGBoostTest.java b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/XGBoostTest.java new file mode 100644 index 000000000..190405c68 --- /dev/null +++ b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/XGBoostTest.java @@ -0,0 +1,121 @@ +/* + Copyright (c) 2023 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ + +package ml.dmlc.xgboost4j.java; + +import junit.framework.TestCase; +import ml.dmlc.xgboost4j.LabeledPoint; +import org.junit.Test; + +import java.util.HashMap; +import java.util.Map; +import java.util.Random; + +public class XGBoostTest { + + private String composeEvalInfo(String metric, String evalName) { + return "[0]\t" + evalName + "-" + metric + ":" + "\ttest"; + } + + @Test + public void testIsMaximizeEvaluation() { + String[] minimum_metrics = {"mape", "logloss", "error", "others"}; + String[] evalNames = {"set-abc"}; + + HashMap params = new HashMap<>(); + + // test1, infer the metric from faked log + for (String x : XGBoost.MAXIMIZ_METRICES) { + String evalInfo = composeEvalInfo(x, evalNames[0]); + TestCase.assertTrue(XGBoost.isMaximizeEvaluation(evalInfo, evalNames, params)); + } + + // test2, the direction for mape should be minimum + String evalInfo = composeEvalInfo("mape", evalNames[0]); + TestCase.assertFalse(XGBoost.isMaximizeEvaluation(evalInfo, evalNames, params)); + + // test3, force maximize_evaluation_metrics + params.clear(); + params.put("maximize_evaluation_metrics", true); + // auc should be max, + evalInfo = composeEvalInfo("auc", evalNames[0]); + TestCase.assertTrue(XGBoost.isMaximizeEvaluation(evalInfo, evalNames, params)); + + params.clear(); + params.put("maximize_evaluation_metrics", false); + // auc should be min, + evalInfo = composeEvalInfo("auc", evalNames[0]); + TestCase.assertFalse(XGBoost.isMaximizeEvaluation(evalInfo, evalNames, params)); + + // test4, set the metric manually + for (String x : XGBoost.MAXIMIZ_METRICES) { + params.clear(); + params.put("eval_metric", x); + evalInfo = composeEvalInfo(x, evalNames[0]); + TestCase.assertTrue(XGBoost.isMaximizeEvaluation(evalInfo, evalNames, params)); + } + + // test5, set the metric manually + for (String x : minimum_metrics) { + params.clear(); + params.put("eval_metric", x); + evalInfo = composeEvalInfo(x, evalNames[0]); + TestCase.assertFalse(XGBoost.isMaximizeEvaluation(evalInfo, evalNames, params)); + } + + } + + @Test + public void testEarlyStop() 
throws XGBoostError {
+    Random random = new Random(1);
+
+    java.util.ArrayList<Float> labelall = new java.util.ArrayList<Float>();
+    int nrep = 3000;
+    java.util.List<LabeledPoint> blist = new java.util.LinkedList<LabeledPoint>();
+    for (int i = 0; i < nrep; ++i) {
+      LabeledPoint p = new LabeledPoint(
+          i % 2, 4,
+          new int[]{0, 1, 2, 3},
+          new float[]{random.nextFloat(), random.nextFloat(), random.nextFloat(), random.nextFloat()});
+      blist.add(p);
+      labelall.add(p.label());
+    }
+
+    DMatrix dmat = new DMatrix(blist.iterator(), null);
+
+    int round = 50;
+    int earlyStop = 2;
+
+    HashMap<String, Object> mapParams = new HashMap<>();
+    mapParams.put("eta", 0.1);
+    mapParams.put("objective", "binary:logistic");
+    mapParams.put("max_depth", 3);
+    mapParams.put("eval_metric", "auc");
+    mapParams.put("silent", 0);
+
+    HashMap<String, DMatrix> mapWatches = new HashMap<>();
+    mapWatches.put("selTrain-*", dmat);
+
+    try {
+      Booster booster = XGBoost.train(dmat, mapParams, round, mapWatches, null, null, null, earlyStop);
+      Map<String, String> attrs = booster.getAttrs();
+      TestCase.assertTrue(Integer.valueOf(attrs.get("best_iteration")) < round - 1);
+    } catch (Exception e) {
+      TestCase.assertFalse(false);
+    }
+
+  }
+}

From 912e341d575f107be1cc2631271fd0737b75dfba Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Mon, 31 Jul 2023 15:50:28 +0800
Subject: [PATCH 056/136] Initial GPU support for the approx tree method.
 (#9414)

---
 doc/parameter.rst                             |   3 +-
 doc/treemethod.rst                            |  32 +--
 python-package/xgboost/testing/updater.py     | 140 ++++++++++-
 src/common/error_msg.h                        |   5 +
 src/common/ranking_utils.h                    |   6 +-
 src/data/ellpack_page.cu                      |   7 +-
 src/data/gradient_index.cc                    |   5 +-
 src/data/simple_dmatrix.cc                    |   6 +-
 src/data/sparse_page_dmatrix.cc               |   1 -
 src/data/sparse_page_dmatrix.cu               |  23 +-
 src/gbm/gbtree.cc                             |   7 +-
 src/tree/constraints.h                        |   8 +-
 src/tree/gpu_hist/gradient_based_sampler.cu   |  28 +--
 src/tree/updater_approx.cc                    |   1 -
 src/tree/updater_gpu_hist.cu                  | 231 +++++++++++++-----
 tests/cpp/tree/test_gpu_hist.cu               |   8 +-
 tests/cpp/tree/test_prediction_cache.cc       |  55 +++--
 tests/cpp/tree/test_regen.cc                  |  61 +++--
 tests/cpp/tree/test_tree_policy.cc            |  46 ++--
 tests/cpp/tree/test_tree_stat.cc              |  32 ++-
 tests/python-gpu/test_gpu_updaters.py         | 130 +++++++---
 tests/python/test_updaters.py                 | 140 +----------
 .../test_gpu_with_dask/test_gpu_with_dask.py  |  24 +-
 23 files changed, 639 insertions(+), 360 deletions(-)

diff --git a/doc/parameter.rst b/doc/parameter.rst
index 2072c4b75..6f767c80d 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -162,7 +162,8 @@ Parameters for Tree Booster
   - ``grow_colmaker``: non-distributed column-based construction of trees.
   - ``grow_histmaker``: distributed tree construction with row-based data splitting based on global proposal of histogram counting.
   - ``grow_quantile_histmaker``: Grow tree using quantized histogram.
-  - ``grow_gpu_hist``: Grow tree with GPU. Enabled when ``tree_method`` is set to ``hist`` along with ``device=cuda``.
+  - ``grow_gpu_hist``: Enabled when ``tree_method`` is set to ``hist`` along with ``device=cuda``.
+  - ``grow_gpu_approx``: Enabled when ``tree_method`` is set to ``approx`` along with ``device=cuda``.
   - ``sync``: synchronizes trees in all distributed nodes.
   - ``refresh``: refreshes tree's statistics and/or leaf values based on the current data. Note that no random subsampling of data rows is performed.
   - ``prune``: prunes the splits where loss < min_split_loss (or gamma) and nodes that have depth greater than ``max_depth``.
diff --git a/doc/treemethod.rst b/doc/treemethod.rst
index 4dfb107a0..1f83401fe 100644
--- a/doc/treemethod.rst
+++ b/doc/treemethod.rst
@@ -123,23 +123,23 @@ Feature Matrix
 Following table summarizes some differences in supported features between 4 tree
 methods, `T` means supported while `F` means unsupported.
 
-+------------------+-----------+---------------------+---------------------+------------------------+
-|                  | Exact     | Approx              | Hist                | Hist (GPU)             |
-+==================+===========+=====================+=====================+========================+
-| grow_policy      | Depthwise | depthwise/lossguide | depthwise/lossguide | depthwise/lossguide    |
-+------------------+-----------+---------------------+---------------------+------------------------+
-| max_leaves       | F         | T                   | T                   | T                      |
-+------------------+-----------+---------------------+---------------------+------------------------+
-| sampling method  | uniform   | uniform             | uniform             | gradient_based/uniform |
-+------------------+-----------+---------------------+---------------------+------------------------+
-| categorical data | F         | T                   | T                   | T                      |
-+------------------+-----------+---------------------+---------------------+------------------------+
-| External memory  | F         | T                   | T                   | P                      |
-+------------------+-----------+---------------------+---------------------+------------------------+
-| Distributed      | F         | T                   | T                   | T                      |
-+------------------+-----------+---------------------+---------------------+------------------------+
++------------------+-----------+---------------------+------------------------+---------------------+------------------------+
+|                  | Exact     | Approx              | Approx (GPU)           | Hist                | Hist (GPU)             |
++==================+===========+=====================+========================+=====================+========================+
+| grow_policy      | Depthwise | depthwise/lossguide | depthwise/lossguide    | depthwise/lossguide | depthwise/lossguide    |
++------------------+-----------+---------------------+------------------------+---------------------+------------------------+
+| max_leaves       | F         | T                   | T                      | T                   | T                      |
++------------------+-----------+---------------------+------------------------+---------------------+------------------------+
+| sampling method  | uniform   | uniform             | gradient_based/uniform | uniform             | gradient_based/uniform |
++------------------+-----------+---------------------+------------------------+---------------------+------------------------+
+| categorical data | F         | T                   | T                      | T                   | T                      |
++------------------+-----------+---------------------+------------------------+---------------------+------------------------+
+| External memory  | F         | T                   | P                      | T                   | P                      |
++------------------+-----------+---------------------+------------------------+---------------------+------------------------+
+| Distributed      | F         | T                   | T                      | T                   | T                      |
++------------------+-----------+---------------------+------------------------+---------------------+------------------------+
 
-Features/parameters that are not mentioned here are universally supported for all 4 tree
+Features/parameters that are not mentioned here are universally supported for all 3 tree
 methods (for instance, column sampling and constraints). The `P` in external
 memory means special handling. Please note that both categorical data and external memory are
 experimental.
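
With the two documentation entries above in place, the GPU approx path is selected from the
Python package purely through ``tree_method`` and ``device``; a minimal sketch on synthetic
data (parameter values here are illustrative, not recommendations):

    import numpy as np
    import xgboost as xgb

    rng = np.random.default_rng(0)
    X = rng.normal(size=(256, 8))
    y = X[:, 0] + rng.normal(scale=0.1, size=256)

    # device="cuda" routes tree_method="approx" to the new grow_gpu_approx updater;
    # the same parameters with device="cpu" keep using grow_histmaker.
    booster = xgb.train(
        {"tree_method": "approx", "device": "cuda", "max_depth": 3},
        xgb.DMatrix(X, label=y),
        num_boost_round=8,
    )
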
diff --git a/python-package/xgboost/testing/updater.py b/python-package/xgboost/testing/updater.py index 62df8ec2e..af5acf428 100644 --- a/python-package/xgboost/testing/updater.py +++ b/python-package/xgboost/testing/updater.py @@ -1,7 +1,7 @@ """Tests for updaters.""" import json from functools import partial, update_wrapper -from typing import Any, Dict +from typing import Any, Dict, List import numpy as np @@ -256,3 +256,141 @@ def check_get_quantile_cut(tree_method: str) -> None: check_get_quantile_cut_device(tree_method, False) if use_cupy: check_get_quantile_cut_device(tree_method, True) + + +USE_ONEHOT = np.iinfo(np.int32).max +USE_PART = 1 + + +def check_categorical_ohe( # pylint: disable=too-many-arguments + rows: int, cols: int, rounds: int, cats: int, device: str, tree_method: str +) -> None: + "Test for one-hot encoding with categorical data." + + onehot, label = tm.make_categorical(rows, cols, cats, True) + cat, _ = tm.make_categorical(rows, cols, cats, False) + + by_etl_results: Dict[str, Dict[str, List[float]]] = {} + by_builtin_results: Dict[str, Dict[str, List[float]]] = {} + + parameters: Dict[str, Any] = { + "tree_method": tree_method, + # Use one-hot exclusively + "max_cat_to_onehot": USE_ONEHOT, + "device": device, + } + + m = xgb.DMatrix(onehot, label, enable_categorical=False) + xgb.train( + parameters, + m, + num_boost_round=rounds, + evals=[(m, "Train")], + evals_result=by_etl_results, + ) + + m = xgb.DMatrix(cat, label, enable_categorical=True) + xgb.train( + parameters, + m, + num_boost_round=rounds, + evals=[(m, "Train")], + evals_result=by_builtin_results, + ) + + # There are guidelines on how to specify tolerance based on considering output + # as random variables. But in here the tree construction is extremely sensitive + # to floating point errors. An 1e-5 error in a histogram bin can lead to an + # entirely different tree. So even though the test is quite lenient, hypothesis + # can still pick up falsifying examples from time to time. + np.testing.assert_allclose( + np.array(by_etl_results["Train"]["rmse"]), + np.array(by_builtin_results["Train"]["rmse"]), + rtol=1e-3, + ) + assert tm.non_increasing(by_builtin_results["Train"]["rmse"]) + + by_grouping: Dict[str, Dict[str, List[float]]] = {} + # switch to partition-based splits + parameters["max_cat_to_onehot"] = USE_PART + parameters["reg_lambda"] = 0 + m = xgb.DMatrix(cat, label, enable_categorical=True) + xgb.train( + parameters, + m, + num_boost_round=rounds, + evals=[(m, "Train")], + evals_result=by_grouping, + ) + rmse_oh = by_builtin_results["Train"]["rmse"] + rmse_group = by_grouping["Train"]["rmse"] + # always better or equal to onehot when there's no regularization. 
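+    # With reg_lambda=0 a partition-based split can isolate a single category and
+    # thereby reproduce any one-hot split at each node, so its training RMSE is
+    # never worse.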
+ for a, b in zip(rmse_oh, rmse_group): + assert a >= b + + parameters["reg_lambda"] = 1.0 + by_grouping = {} + xgb.train( + parameters, + m, + num_boost_round=32, + evals=[(m, "Train")], + evals_result=by_grouping, + ) + assert tm.non_increasing(by_grouping["Train"]["rmse"]), by_grouping + + +def check_categorical_missing( + rows: int, cols: int, cats: int, device: str, tree_method: str +) -> None: + """Check categorical data with missing values.""" + parameters: Dict[str, Any] = {"tree_method": tree_method, "device": device} + cat, label = tm.make_categorical( + rows, n_features=cols, n_categories=cats, onehot=False, sparsity=0.5 + ) + Xy = xgb.DMatrix(cat, label, enable_categorical=True) + + def run(max_cat_to_onehot: int) -> None: + # Test with onehot splits + parameters["max_cat_to_onehot"] = max_cat_to_onehot + + evals_result: Dict[str, Dict] = {} + booster = xgb.train( + parameters, + Xy, + num_boost_round=16, + evals=[(Xy, "Train")], + evals_result=evals_result, + ) + assert tm.non_increasing(evals_result["Train"]["rmse"]) + y_predt = booster.predict(Xy) + + rmse = tm.root_mean_square(label, y_predt) + np.testing.assert_allclose(rmse, evals_result["Train"]["rmse"][-1], rtol=2e-5) + + # Test with OHE split + run(USE_ONEHOT) + + # Test with partition-based split + run(USE_PART) + + +def train_result( + param: Dict[str, Any], dmat: xgb.DMatrix, num_rounds: int +) -> Dict[str, Any]: + """Get training result from parameters and data.""" + result: Dict[str, Any] = {} + booster = xgb.train( + param, + dmat, + num_rounds, + evals=[(dmat, "train")], + verbose_eval=False, + evals_result=result, + ) + assert booster.num_features() == dmat.num_col() + assert booster.num_boosted_rounds() == num_rounds + assert booster.feature_names == dmat.feature_names + assert booster.feature_types == dmat.feature_types + + return result diff --git a/src/common/error_msg.h b/src/common/error_msg.h index 8bdc85999..1af4b7c88 100644 --- a/src/common/error_msg.h +++ b/src/common/error_msg.h @@ -89,5 +89,10 @@ void WarnDeprecatedGPUId(); void WarnEmptyDataset(); std::string DeprecatedFunc(StringView old, StringView since, StringView replacement); + +constexpr StringView InvalidCUDAOrdinal() { + return "Invalid device. `device` is required to be CUDA and there must be at least one GPU " + "available for using GPU."; +} } // namespace xgboost::error #endif // XGBOOST_COMMON_ERROR_MSG_H_ diff --git a/src/common/ranking_utils.h b/src/common/ranking_utils.h index 7d11de048..75622bd84 100644 --- a/src/common/ranking_utils.h +++ b/src/common/ranking_utils.h @@ -12,7 +12,7 @@ #include // for vector #include "dmlc/parameter.h" // for FieldEntry, DMLC_DECLARE_FIELD -#include "error_msg.h" // for GroupWeight, GroupSize +#include "error_msg.h" // for GroupWeight, GroupSize, InvalidCUDAOrdinal #include "xgboost/base.h" // for XGBOOST_DEVICE, bst_group_t #include "xgboost/context.h" // for Context #include "xgboost/data.h" // for MetaInfo @@ -240,7 +240,7 @@ class RankingCache { // The function simply returns a uninitialized buffer as this is only used by the // objective for creating pairs. 
common::Span SortedIdxY(Context const* ctx, std::size_t n_samples) { - CHECK(ctx->IsCUDA()); + CHECK(ctx->IsCUDA()) << error::InvalidCUDAOrdinal(); if (y_sorted_idx_cache_.Empty()) { y_sorted_idx_cache_.SetDevice(ctx->gpu_id); y_sorted_idx_cache_.Resize(n_samples); @@ -248,7 +248,7 @@ class RankingCache { return y_sorted_idx_cache_.DeviceSpan(); } common::Span RankedY(Context const* ctx, std::size_t n_samples) { - CHECK(ctx->IsCUDA()); + CHECK(ctx->IsCUDA()) << error::InvalidCUDAOrdinal(); if (y_ranked_by_model_.Empty()) { y_ranked_by_model_.SetDevice(ctx->gpu_id); y_ranked_by_model_.Resize(n_samples); diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index 7097df405..369021376 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -11,7 +11,6 @@ #include "../common/categorical.h" #include "../common/cuda_context.cuh" #include "../common/hist_util.cuh" -#include "../common/random.h" #include "../common/transform_iterator.h" // MakeIndexTransformIter #include "./ellpack_page.cuh" #include "device_adapter.cuh" // for HasInfInData @@ -131,7 +130,11 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchP monitor_.Start("Quantiles"); // Create the quantile sketches for the dmatrix and initialize HistogramCuts. row_stride = GetRowStride(dmat); - cuts_ = common::DeviceSketch(ctx, dmat, param.max_bin); + if (!param.hess.empty()) { + cuts_ = common::DeviceSketchWithHessian(ctx, dmat, param.max_bin, param.hess); + } else { + cuts_ = common::DeviceSketch(ctx, dmat, param.max_bin); + } monitor_.Stop("Quantiles"); monitor_.Start("InitCompressedData"); diff --git a/src/data/gradient_index.cc b/src/data/gradient_index.cc index 1ee1bd60b..a2b3f3e54 100644 --- a/src/data/gradient_index.cc +++ b/src/data/gradient_index.cc @@ -7,13 +7,12 @@ #include #include #include -#include // std::forward +#include // for forward #include "../common/column_matrix.h" #include "../common/hist_util.h" #include "../common/numeric.h" -#include "../common/threading_utils.h" -#include "../common/transform_iterator.h" // MakeIndexTransformIter +#include "../common/transform_iterator.h" // for MakeIndexTransformIter namespace xgboost { diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc index 5a2f6f8df..85ede3258 100644 --- a/src/data/simple_dmatrix.cc +++ b/src/data/simple_dmatrix.cc @@ -8,12 +8,12 @@ #include #include +#include // for accumulate #include #include -#include "../common/error_msg.h" // for InconsistentMaxBin -#include "../common/random.h" -#include "../common/threading_utils.h" +#include "../collective/communicator-inl.h" // for GetWorldSize, GetRank, Allgather +#include "../common/error_msg.h" // for InconsistentMaxBin #include "./simple_batch_iterator.h" #include "adapter.h" #include "batch_utils.h" // for CheckEmpty, RegenGHist diff --git a/src/data/sparse_page_dmatrix.cc b/src/data/sparse_page_dmatrix.cc index ec9c90b10..042a75c56 100644 --- a/src/data/sparse_page_dmatrix.cc +++ b/src/data/sparse_page_dmatrix.cc @@ -8,7 +8,6 @@ #include "./sparse_page_dmatrix.h" #include "../collective/communicator-inl.h" -#include "./simple_batch_iterator.h" #include "batch_utils.h" // for RegenGHist #include "gradient_index.h" diff --git a/src/data/sparse_page_dmatrix.cu b/src/data/sparse_page_dmatrix.cu index 1d9af9f06..9d4c63387 100644 --- a/src/data/sparse_page_dmatrix.cu +++ b/src/data/sparse_page_dmatrix.cu @@ -1,13 +1,15 @@ /** * Copyright 2021-2023 by XGBoost contributors */ -#include +#include // for unique_ptr #include 
"../common/hist_util.cuh" -#include "batch_utils.h" // for CheckEmpty, RegenGHist +#include "../common/hist_util.h" // for HistogramCuts +#include "batch_utils.h" // for CheckEmpty, RegenGHist #include "ellpack_page.cuh" #include "sparse_page_dmatrix.h" -#include "sparse_page_source.h" +#include "xgboost/context.h" // for Context +#include "xgboost/data.h" // for BatchParam namespace xgboost::data { BatchSet SparsePageDMatrix::GetEllpackBatches(Context const* ctx, @@ -25,8 +27,13 @@ BatchSet SparsePageDMatrix::GetEllpackBatches(Context const* ctx, cache_info_.erase(id); MakeCache(this, ".ellpack.page", cache_prefix_, &cache_info_); std::unique_ptr cuts; - cuts = - std::make_unique(common::DeviceSketch(ctx, this, param.max_bin, 0)); + if (!param.hess.empty()) { + cuts = std::make_unique( + common::DeviceSketchWithHessian(ctx, this, param.max_bin, param.hess)); + } else { + cuts = + std::make_unique(common::DeviceSketch(ctx, this, param.max_bin)); + } this->InitializeSparsePage(ctx); // reset after use. row_stride = GetRowStride(this); @@ -35,10 +42,10 @@ BatchSet SparsePageDMatrix::GetEllpackBatches(Context const* ctx, batch_param_ = param; auto ft = this->info_.feature_types.ConstDeviceSpan(); - ellpack_page_source_.reset(); // release resources. - ellpack_page_source_.reset(new EllpackPageSource( + ellpack_page_source_.reset(); // make sure resource is released before making new ones. + ellpack_page_source_ = std::make_shared( this->missing_, ctx->Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id), - param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_, ctx->gpu_id)); + param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_, ctx->gpu_id); } else { CHECK(sparse_page_source_); ellpack_page_source_->Reset(); diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 8b456af66..e3df38629 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -47,15 +47,16 @@ std::string MapTreeMethodToUpdaters(Context const* ctx, TreeMethod tree_method) if (ctx->IsCUDA()) { common::AssertGPUSupport(); } + switch (tree_method) { case TreeMethod::kAuto: // Use hist as default in 2.0 case TreeMethod::kHist: { return ctx->DispatchDevice([] { return "grow_quantile_histmaker"; }, [] { return "grow_gpu_hist"; }); } - case TreeMethod::kApprox: - CHECK(ctx->IsCPU()) << "The `approx` tree method is not supported on GPU."; - return "grow_histmaker"; + case TreeMethod::kApprox: { + return ctx->DispatchDevice([] { return "grow_histmaker"; }, [] { return "grow_gpu_approx"; }); + } case TreeMethod::kExact: CHECK(ctx->IsCPU()) << "The `exact` tree method is not supported on GPU."; return "grow_colmaker,prune"; diff --git a/src/tree/constraints.h b/src/tree/constraints.h index 580576a58..3789d2a24 100644 --- a/src/tree/constraints.h +++ b/src/tree/constraints.h @@ -1,5 +1,5 @@ -/*! - * Copyright 2018-2019 by Contributors +/** + * Copyright 2018-2023 by Contributors */ #ifndef XGBOOST_TREE_CONSTRAINTS_H_ #define XGBOOST_TREE_CONSTRAINTS_H_ @@ -8,10 +8,8 @@ #include #include -#include "xgboost/span.h" -#include "xgboost/base.h" - #include "param.h" +#include "xgboost/base.h" namespace xgboost { /*! 
diff --git a/src/tree/gpu_hist/gradient_based_sampler.cu b/src/tree/gpu_hist/gradient_based_sampler.cu index 5f763fb93..1082f8955 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cu +++ b/src/tree/gpu_hist/gradient_based_sampler.cu @@ -8,10 +8,10 @@ #include #include +#include // for size_t #include #include -#include "../../common/compressed_iterator.h" #include "../../common/cuda_context.cuh" // for CUDAContext #include "../../common/random.h" #include "../param.h" @@ -202,27 +202,27 @@ ExternalMemoryUniformSampling::ExternalMemoryUniformSampling(size_t n_rows, GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx, common::Span gpair, DMatrix* dmat) { + auto cuctx = ctx->CUDACtx(); // Set gradient pair to 0 with p = 1 - subsample - thrust::replace_if(dh::tbegin(gpair), dh::tend(gpair), - thrust::counting_iterator(0), - BernoulliTrial(common::GlobalRandom()(), subsample_), - GradientPair()); + thrust::replace_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), + thrust::counting_iterator(0), + BernoulliTrial(common::GlobalRandom()(), subsample_), GradientPair{}); // Count the sampled rows. - size_t sample_rows = thrust::count_if(dh::tbegin(gpair), dh::tend(gpair), IsNonZero()); + size_t sample_rows = + thrust::count_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), IsNonZero{}); // Compact gradient pairs. gpair_.resize(sample_rows); - thrust::copy_if(dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero()); + thrust::copy_if(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), gpair_.begin(), IsNonZero{}); // Index the sample rows. - thrust::transform(dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(), IsNonZero()); - thrust::exclusive_scan(sample_row_index_.begin(), sample_row_index_.end(), + thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(), + IsNonZero()); + thrust::exclusive_scan(cuctx->CTP(), sample_row_index_.begin(), sample_row_index_.end(), sample_row_index_.begin()); - thrust::transform(dh::tbegin(gpair), dh::tend(gpair), - sample_row_index_.begin(), - sample_row_index_.begin(), - ClearEmptyRows()); + thrust::transform(cuctx->CTP(), dh::tbegin(gpair), dh::tend(gpair), sample_row_index_.begin(), + sample_row_index_.begin(), ClearEmptyRows()); auto batch_iterator = dmat->GetBatches(ctx, batch_param_); auto first_page = (*batch_iterator.begin()).Impl(); @@ -232,7 +232,7 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx, first_page->row_stride, sample_rows)); // Compact the ELLPACK pages into the single sample page. 
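   // (gidx_buffer is zero-filled first; each streamed batch then scatters only its
   // sampled rows into the single output page via sample_row_index_.)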
- thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0); + thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0); for (auto& batch : batch_iterator) { page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_)); } diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc index 78506305f..7b5020621 100644 --- a/src/tree/updater_approx.cc +++ b/src/tree/updater_approx.cc @@ -11,7 +11,6 @@ #include "../common/random.h" #include "../data/gradient_index.h" #include "common_row_partitioner.h" -#include "constraints.h" #include "driver.h" #include "hist/evaluate_splits.h" #include "hist/histogram.h" diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index e2a863e3d..56d7d2a89 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -31,7 +31,6 @@ #include "gpu_hist/histogram.cuh" #include "gpu_hist/row_partitioner.cuh" #include "param.h" -#include "split_evaluator.h" #include "updater_gpu_common.cuh" #include "xgboost/base.h" #include "xgboost/context.h" @@ -49,13 +48,30 @@ DMLC_REGISTRY_FILE_TAG(updater_gpu_hist); #endif // !defined(GTEST_TEST) // training parameters specific to this algorithm -struct GPUHistMakerTrainParam - : public XGBoostParameter { +struct GPUHistMakerTrainParam : public XGBoostParameter { bool debug_synchronize; // declare parameters DMLC_DECLARE_PARAMETER(GPUHistMakerTrainParam) { - DMLC_DECLARE_FIELD(debug_synchronize).set_default(false).describe( - "Check if all distributed tree are identical after tree construction."); + DMLC_DECLARE_FIELD(debug_synchronize) + .set_default(false) + .describe("Check if all distributed tree are identical after tree construction."); + } + + // Only call this method for testing + void CheckTreesSynchronized(RegTree const* local_tree) const { + if (this->debug_synchronize) { + std::string s_model; + common::MemoryBufferStream fs(&s_model); + int rank = collective::GetRank(); + if (rank == 0) { + local_tree->Save(&fs); + } + fs.Seek(0); + collective::Broadcast(&s_model, 0); + RegTree reference_tree{}; // rank 0 tree + reference_tree.Load(&fs); + CHECK(*local_tree == reference_tree); + } } }; #if !defined(GTEST_TEST) @@ -170,16 +186,15 @@ class DeviceHistogramStorage { }; // Manage memory for a single GPU -template struct GPUHistMakerDevice { private: GPUHistEvaluator evaluator_; Context const* ctx_; + std::shared_ptr column_sampler_; public: EllpackPageImpl const* page{nullptr}; common::Span feature_types; - BatchParam batch_param; std::unique_ptr row_partitioner; DeviceHistogramStorage<> hist{}; @@ -199,7 +214,6 @@ struct GPUHistMakerDevice { dh::PinnedMemory pinned2; common::Monitor monitor; - common::ColumnSampler column_sampler; FeatureInteractionConstraintDevice interaction_constraints; std::unique_ptr sampler; @@ -208,22 +222,22 @@ struct GPUHistMakerDevice { GPUHistMakerDevice(Context const* ctx, bool is_external_memory, common::Span _feature_types, bst_row_t _n_rows, - TrainParam _param, uint32_t column_sampler_seed, uint32_t n_features, - BatchParam _batch_param) + TrainParam _param, std::shared_ptr column_sampler, + uint32_t n_features, BatchParam batch_param) : evaluator_{_param, n_features, ctx->gpu_id}, ctx_(ctx), feature_types{_feature_types}, param(std::move(_param)), - column_sampler(column_sampler_seed), - interaction_constraints(param, n_features), - batch_param(std::move(_batch_param)) { - sampler.reset(new GradientBasedSampler(ctx, _n_rows, batch_param, param.subsample, - 
param.sampling_method, is_external_memory)); + column_sampler_(std::move(column_sampler)), + interaction_constraints(param, n_features) { + sampler = std::make_unique(ctx, _n_rows, batch_param, param.subsample, + param.sampling_method, is_external_memory); if (!param.monotone_constraints.empty()) { // Copy assigning an empty vector causes an exception in MSVC debug builds monotone_constraints = param.monotone_constraints; } + CHECK(column_sampler_); monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id)); } @@ -234,16 +248,16 @@ struct GPUHistMakerDevice { CHECK(page); feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense, dh::MaxSharedMemoryOptin(ctx_->gpu_id), - sizeof(GradientSumT))); + sizeof(GradientPairPrecise))); } } // Reset values for each update iteration void Reset(HostDeviceVector* dh_gpair, DMatrix* dmat, int64_t num_columns) { auto const& info = dmat->Info(); - this->column_sampler.Init(ctx_, num_columns, info.feature_weights.HostVector(), - param.colsample_bynode, param.colsample_bylevel, - param.colsample_bytree); + this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(), + param.colsample_bynode, param.colsample_bylevel, + param.colsample_bytree); dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); this->interaction_constraints.Reset(); @@ -275,8 +289,8 @@ struct GPUHistMakerDevice { GPUExpandEntry EvaluateRootSplit(GradientPairInt64 root_sum) { int nidx = RegTree::kRoot; GPUTrainingParam gpu_param(param); - auto sampled_features = column_sampler.GetFeatureSet(0); - sampled_features->SetDevice(ctx_->gpu_id); + auto sampled_features = column_sampler_->GetFeatureSet(0); + sampled_features->SetDevice(ctx_->Device()); common::Span feature_set = interaction_constraints.Query(sampled_features->DeviceSpan(), nidx); auto matrix = page->GetDeviceAccessor(ctx_->gpu_id); @@ -316,13 +330,13 @@ struct GPUHistMakerDevice { int right_nidx = tree[candidate.nid].RightChild(); nidx[i * 2] = left_nidx; nidx[i * 2 + 1] = right_nidx; - auto left_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(left_nidx)); - left_sampled_features->SetDevice(ctx_->gpu_id); + auto left_sampled_features = column_sampler_->GetFeatureSet(tree.GetDepth(left_nidx)); + left_sampled_features->SetDevice(ctx_->Device()); feature_sets.emplace_back(left_sampled_features); common::Span left_feature_set = interaction_constraints.Query(left_sampled_features->DeviceSpan(), left_nidx); - auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx)); - right_sampled_features->SetDevice(ctx_->gpu_id); + auto right_sampled_features = column_sampler_->GetFeatureSet(tree.GetDepth(right_nidx)); + right_sampled_features->SetDevice(ctx_->Device()); feature_sets.emplace_back(right_sampled_features); common::Span right_feature_set = interaction_constraints.Query(right_sampled_features->DeviceSpan(), @@ -657,7 +671,6 @@ struct GPUHistMakerDevice { evaluator_.ApplyTreeSplit(candidate, p_tree); const auto& parent = tree[candidate.nid]; - std::size_t max_nidx = std::max(parent.LeftChild(), parent.RightChild()); interaction_constraints.Split(candidate.nid, parent.SplitIndex(), parent.LeftChild(), parent.RightChild()); } @@ -693,9 +706,8 @@ struct GPUHistMakerDevice { return root_entry; } - void UpdateTree(HostDeviceVector* gpair_all, DMatrix* p_fmat, - ObjInfo const* task, RegTree* p_tree, - HostDeviceVector* p_out_position) { + void UpdateTree(HostDeviceVector* gpair_all, DMatrix* p_fmat, ObjInfo const* task, + RegTree* p_tree, HostDeviceVector* 
p_out_position) { auto& tree = *p_tree; // Process maximum 32 nodes at a time Driver driver(param, 32); @@ -720,7 +732,6 @@ struct GPUHistMakerDevice { std::copy_if(expand_set.begin(), expand_set.end(), std::back_inserter(filtered_expand_set), [&](const auto& e) { return driver.IsChildValid(e); }); - auto new_candidates = pinned.GetSpan(filtered_expand_set.size() * 2, GPUExpandEntry()); @@ -753,8 +764,7 @@ class GPUHistMaker : public TreeUpdater { using GradientSumT = GradientPairPrecise; public: - explicit GPUHistMaker(Context const* ctx, ObjInfo const* task) - : TreeUpdater(ctx), task_{task} {}; + explicit GPUHistMaker(Context const* ctx, ObjInfo const* task) : TreeUpdater(ctx), task_{task} {}; void Configure(const Args& args) override { // Used in test to count how many configurations are performed LOG(DEBUG) << "[GPU Hist]: Configure"; @@ -786,13 +796,10 @@ class GPUHistMaker : public TreeUpdater { // build tree try { - size_t t_idx{0}; + std::size_t t_idx{0}; for (xgboost::RegTree* tree : trees) { this->UpdateTree(param, gpair, dmat, tree, &out_position[t_idx]); - - if (hist_maker_param_.debug_synchronize) { - this->CheckTreesSynchronized(tree); - } + this->hist_maker_param_.CheckTreesSynchronized(tree); ++t_idx; } dh::safe_cuda(cudaGetLastError()); @@ -809,13 +816,14 @@ class GPUHistMaker : public TreeUpdater { // Synchronise the column sampling seed uint32_t column_sampling_seed = common::GlobalRandom()(); collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0); + this->column_sampler_ = std::make_shared(column_sampling_seed); auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()}; dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); info_->feature_types.SetDevice(ctx_->gpu_id); - maker.reset(new GPUHistMakerDevice( + maker = std::make_unique( ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_, - *param, column_sampling_seed, info_->num_col_, batch_param)); + *param, column_sampler_, info_->num_col_, batch_param); p_last_fmat_ = dmat; initialised_ = true; @@ -830,21 +838,6 @@ class GPUHistMaker : public TreeUpdater { p_last_tree_ = p_tree; } - // Only call this method for testing - void CheckTreesSynchronized(RegTree* local_tree) const { - std::string s_model; - common::MemoryBufferStream fs(&s_model); - int rank = collective::GetRank(); - if (rank == 0) { - local_tree->Save(&fs); - } - fs.Seek(0); - collective::Broadcast(&s_model, 0); - RegTree reference_tree{}; // rank 0 tree - reference_tree.Load(&fs); - CHECK(*local_tree == reference_tree); - } - void UpdateTree(TrainParam const* param, HostDeviceVector* gpair, DMatrix* p_fmat, RegTree* p_tree, HostDeviceVector* p_out_position) { monitor_.Start("InitData"); @@ -868,7 +861,7 @@ class GPUHistMaker : public TreeUpdater { MetaInfo* info_{}; // NOLINT - std::unique_ptr> maker; // NOLINT + std::unique_ptr maker; // NOLINT [[nodiscard]] char const* Name() const override { return "grow_gpu_hist"; } [[nodiscard]] bool HasNodePosition() const override { return true; } @@ -883,6 +876,7 @@ class GPUHistMaker : public TreeUpdater { ObjInfo const* task_{nullptr}; common::Monitor monitor_; + std::shared_ptr column_sampler_; }; #if !defined(GTEST_TEST) @@ -892,4 +886,131 @@ XGBOOST_REGISTER_TREE_UPDATER(GPUHistMaker, "grow_gpu_hist") return new GPUHistMaker(ctx, task); }); #endif // !defined(GTEST_TEST) + +class GPUGlobalApproxMaker : public TreeUpdater { + public: + explicit GPUGlobalApproxMaker(Context const* ctx, ObjInfo const* task) + : TreeUpdater(ctx), task_{task} 
{}; + void Configure(Args const& args) override { + // Used in test to count how many configurations are performed + LOG(DEBUG) << "[GPU Approx]: Configure"; + hist_maker_param_.UpdateAllowUnknown(args); + dh::CheckComputeCapability(); + initialised_ = false; + + monitor_.Init(this->Name()); + } + + void LoadConfig(Json const& in) override { + auto const& config = get(in); + FromJson(config.at("approx_train_param"), &this->hist_maker_param_); + initialised_ = false; + } + void SaveConfig(Json* p_out) const override { + auto& out = *p_out; + out["approx_train_param"] = ToJson(hist_maker_param_); + } + ~GPUGlobalApproxMaker() override { dh::GlobalMemoryLogger().Log(); } + + void Update(TrainParam const* param, HostDeviceVector* gpair, DMatrix* p_fmat, + common::Span> out_position, + const std::vector& trees) override { + monitor_.Start("Update"); + + this->InitDataOnce(p_fmat); + // build tree + hess_.resize(gpair->Size()); + auto hess = dh::ToSpan(hess_); + + gpair->SetDevice(ctx_->Device()); + auto d_gpair = gpair->ConstDeviceSpan(); + auto cuctx = ctx_->CUDACtx(); + thrust::transform(cuctx->CTP(), dh::tcbegin(d_gpair), dh::tcend(d_gpair), dh::tbegin(hess), + [=] XGBOOST_DEVICE(GradientPair const& g) { return g.GetHess(); }); + + auto const& info = p_fmat->Info(); + info.feature_types.SetDevice(ctx_->Device()); + auto batch = BatchParam{param->max_bin, hess, !task_->const_hess}; + maker_ = std::make_unique( + ctx_, !p_fmat->SingleColBlock(), info.feature_types.ConstDeviceSpan(), info.num_row_, + *param, column_sampler_, info.num_col_, batch); + + std::size_t t_idx{0}; + for (xgboost::RegTree* tree : trees) { + this->UpdateTree(gpair, p_fmat, tree, &out_position[t_idx]); + this->hist_maker_param_.CheckTreesSynchronized(tree); + ++t_idx; + } + + monitor_.Stop("Update"); + } + + void InitDataOnce(DMatrix* p_fmat) { + if (this->initialised_) { + return; + } + + monitor_.Start(__func__); + CHECK(ctx_->IsCUDA()) << error::InvalidCUDAOrdinal(); + // Synchronise the column sampling seed + uint32_t column_sampling_seed = common::GlobalRandom()(); + collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0); + this->column_sampler_ = std::make_shared(column_sampling_seed); + + p_last_fmat_ = p_fmat; + initialised_ = true; + monitor_.Stop(__func__); + } + + void InitData(DMatrix* p_fmat, RegTree const* p_tree) { + this->InitDataOnce(p_fmat); + p_last_tree_ = p_tree; + } + + void UpdateTree(HostDeviceVector* gpair, DMatrix* p_fmat, RegTree* p_tree, + HostDeviceVector* p_out_position) { + monitor_.Start("InitData"); + this->InitData(p_fmat, p_tree); + monitor_.Stop("InitData"); + + gpair->SetDevice(ctx_->gpu_id); + maker_->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position); + } + + bool UpdatePredictionCache(const DMatrix* data, + linalg::MatrixView p_out_preds) override { + if (maker_ == nullptr || p_last_fmat_ == nullptr || p_last_fmat_ != data) { + return false; + } + monitor_.Start("UpdatePredictionCache"); + bool result = maker_->UpdatePredictionCache(p_out_preds, p_last_tree_); + monitor_.Stop("UpdatePredictionCache"); + return result; + } + + [[nodiscard]] char const* Name() const override { return "grow_gpu_approx"; } + [[nodiscard]] bool HasNodePosition() const override { return true; } + + private: + bool initialised_{false}; + + GPUHistMakerTrainParam hist_maker_param_; + dh::device_vector hess_; + std::shared_ptr column_sampler_; + std::unique_ptr maker_; + + DMatrix* p_last_fmat_{nullptr}; + RegTree const* p_last_tree_{nullptr}; + ObjInfo const* task_{nullptr}; + + 
common::Monitor monitor_; +}; + +#if !defined(GTEST_TEST) +XGBOOST_REGISTER_TREE_UPDATER(GPUApproxMaker, "grow_gpu_approx") + .describe("Grow tree with GPU.") + .set_body([](Context const* ctx, ObjInfo const* task) { + return new GPUGlobalApproxMaker(ctx, task); + }); +#endif // !defined(GTEST_TEST) } // namespace xgboost::tree diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index b250cd2ab..2bd47d42c 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -13,10 +13,7 @@ #include "../../../src/common/common.h" #include "../../../src/data/ellpack_page.cuh" // for EllpackPageImpl #include "../../../src/data/ellpack_page.h" // for EllpackPage -#include "../../../src/data/sparse_page_source.h" -#include "../../../src/tree/constraints.cuh" #include "../../../src/tree/param.h" // for TrainParam -#include "../../../src/tree/updater_gpu_common.cuh" #include "../../../src/tree/updater_gpu_hist.cu" #include "../filesystem.h" // dmlc::TemporaryDirectory #include "../helpers.h" @@ -94,8 +91,9 @@ void TestBuildHist(bool use_shared_memory_histograms) { auto page = BuildEllpackPage(kNRows, kNCols); BatchParam batch_param{}; Context ctx{MakeCUDACtx(0)}; - GPUHistMakerDevice maker(&ctx, /*is_external_memory=*/false, {}, kNRows, param, - kNCols, kNCols, batch_param); + auto cs = std::make_shared(0); + GPUHistMakerDevice maker(&ctx, /*is_external_memory=*/false, {}, kNRows, param, cs, kNCols, + batch_param); xgboost::SimpleLCG gen; xgboost::SimpleRealUniformDistribution dist(0.0f, 1.0f); HostDeviceVector gpair(kNRows); diff --git a/tests/cpp/tree/test_prediction_cache.cc b/tests/cpp/tree/test_prediction_cache.cc index e60d9cd8a..333f1eccc 100644 --- a/tests/cpp/tree/test_prediction_cache.cc +++ b/tests/cpp/tree/test_prediction_cache.cc @@ -24,15 +24,11 @@ class TestPredictionCache : public ::testing::Test { Xy_ = RandomDataGenerator{n_samples_, n_features, 0}.Targets(n_targets).GenerateDMatrix(true); } - void RunLearnerTest(std::string updater_name, float subsample, std::string const& grow_policy, - std::string const& strategy) { + void RunLearnerTest(Context const* ctx, std::string updater_name, float subsample, + std::string const& grow_policy, std::string const& strategy) { std::unique_ptr learner{Learner::Create({Xy_})}; - if (updater_name == "grow_gpu_hist") { - // gpu_id setup - learner->SetParam("tree_method", "gpu_hist"); - } else { - learner->SetParam("updater", updater_name); - } + learner->SetParam("device", ctx->DeviceName()); + learner->SetParam("updater", updater_name); learner->SetParam("multi_strategy", strategy); learner->SetParam("grow_policy", grow_policy); learner->SetParam("subsample", std::to_string(subsample)); @@ -65,20 +61,14 @@ class TestPredictionCache : public ::testing::Test { } } - void RunTest(std::string const& updater_name, std::string const& strategy) { + void RunTest(Context* ctx, std::string const& updater_name, std::string const& strategy) { { - Context ctx; - ctx.InitAllowUnknown(Args{{"nthread", "8"}}); - if (updater_name == "grow_gpu_hist") { - ctx = ctx.MakeCUDA(0); - } else { - ctx = ctx.MakeCPU(); - } + ctx->InitAllowUnknown(Args{{"nthread", "8"}}); ObjInfo task{ObjInfo::kRegression}; - std::unique_ptr updater{TreeUpdater::Create(updater_name, &ctx, &task)}; + std::unique_ptr updater{TreeUpdater::Create(updater_name, ctx, &task)}; RegTree tree; - std::vector trees{&tree}; + std::vector trees{&tree}; auto gpair = GenerateRandomGradients(n_samples_); tree::TrainParam param; 
param.UpdateAllowUnknown(Args{{"max_bin", "64"}}); @@ -86,33 +76,46 @@ class TestPredictionCache : public ::testing::Test { std::vector> position(1); updater->Update(¶m, &gpair, Xy_.get(), position, trees); HostDeviceVector out_prediction_cached; - out_prediction_cached.SetDevice(ctx.gpu_id); + out_prediction_cached.SetDevice(ctx->Device()); out_prediction_cached.Resize(n_samples_); auto cache = - linalg::MakeTensorView(&ctx, &out_prediction_cached, out_prediction_cached.Size(), 1); + linalg::MakeTensorView(ctx, &out_prediction_cached, out_prediction_cached.Size(), 1); ASSERT_TRUE(updater->UpdatePredictionCache(Xy_.get(), cache)); } for (auto policy : {"depthwise", "lossguide"}) { for (auto subsample : {1.0f, 0.4f}) { - this->RunLearnerTest(updater_name, subsample, policy, strategy); - this->RunLearnerTest(updater_name, subsample, policy, strategy); + this->RunLearnerTest(ctx, updater_name, subsample, policy, strategy); + this->RunLearnerTest(ctx, updater_name, subsample, policy, strategy); } } } }; -TEST_F(TestPredictionCache, Approx) { this->RunTest("grow_histmaker", "one_output_per_tree"); } +TEST_F(TestPredictionCache, Approx) { + Context ctx; + this->RunTest(&ctx, "grow_histmaker", "one_output_per_tree"); +} TEST_F(TestPredictionCache, Hist) { - this->RunTest("grow_quantile_histmaker", "one_output_per_tree"); + Context ctx; + this->RunTest(&ctx, "grow_quantile_histmaker", "one_output_per_tree"); } TEST_F(TestPredictionCache, HistMulti) { - this->RunTest("grow_quantile_histmaker", "multi_output_tree"); + Context ctx; + this->RunTest(&ctx, "grow_quantile_histmaker", "multi_output_tree"); } #if defined(XGBOOST_USE_CUDA) -TEST_F(TestPredictionCache, GpuHist) { this->RunTest("grow_gpu_hist", "one_output_per_tree"); } +TEST_F(TestPredictionCache, GpuHist) { + auto ctx = MakeCUDACtx(0); + this->RunTest(&ctx, "grow_gpu_hist", "one_output_per_tree"); +} + +TEST_F(TestPredictionCache, GpuApprox) { + auto ctx = MakeCUDACtx(0); + this->RunTest(&ctx, "grow_gpu_approx", "one_output_per_tree"); +} #endif // defined(XGBOOST_USE_CUDA) } // namespace xgboost diff --git a/tests/cpp/tree/test_regen.cc b/tests/cpp/tree/test_regen.cc index d0fe5b449..837159329 100644 --- a/tests/cpp/tree/test_regen.cc +++ b/tests/cpp/tree/test_regen.cc @@ -62,8 +62,10 @@ class RegenTest : public ::testing::Test { auto constexpr Iter() const { return 4; } template - size_t TestTreeMethod(std::string tree_method, std::string obj, bool reset = true) const { + size_t TestTreeMethod(Context const* ctx, std::string tree_method, std::string obj, + bool reset = true) const { auto learner = std::unique_ptr{Learner::Create({p_fmat_})}; + learner->SetParam("device", ctx->DeviceName()); learner->SetParam("tree_method", tree_method); learner->SetParam("objective", obj); learner->Configure(); @@ -87,40 +89,71 @@ class RegenTest : public ::testing::Test { } // anonymous namespace TEST_F(RegenTest, Approx) { - auto n = this->TestTreeMethod("approx", "reg:squarederror"); + Context ctx; + auto n = this->TestTreeMethod(&ctx, "approx", "reg:squarederror"); ASSERT_EQ(n, 1); - n = this->TestTreeMethod("approx", "reg:logistic"); + n = this->TestTreeMethod(&ctx, "approx", "reg:logistic"); ASSERT_EQ(n, this->Iter()); } TEST_F(RegenTest, Hist) { - auto n = this->TestTreeMethod("hist", "reg:squarederror"); + Context ctx; + auto n = this->TestTreeMethod(&ctx, "hist", "reg:squarederror"); ASSERT_EQ(n, 1); - n = this->TestTreeMethod("hist", "reg:logistic"); + n = this->TestTreeMethod(&ctx, "hist", "reg:logistic"); ASSERT_EQ(n, 1); } TEST_F(RegenTest, 
Mixed) {
-  auto n = this->TestTreeMethod("hist", "reg:squarederror", false);
+  Context ctx;
+  auto n = this->TestTreeMethod(&ctx, "hist", "reg:squarederror", false);
   ASSERT_EQ(n, 1);
-  n = this->TestTreeMethod("approx", "reg:logistic", true);
+  n = this->TestTreeMethod(&ctx, "approx", "reg:logistic", true);
   ASSERT_EQ(n, this->Iter() + 1);
-  n = this->TestTreeMethod("approx", "reg:logistic", false);
+  n = this->TestTreeMethod(&ctx, "approx", "reg:logistic", false);
   ASSERT_EQ(n, this->Iter());
-  n = this->TestTreeMethod("hist", "reg:squarederror", true);
+  n = this->TestTreeMethod(&ctx, "hist", "reg:squarederror", true);
   ASSERT_EQ(n, this->Iter() + 1);
 }
 #if defined(XGBOOST_USE_CUDA)
-TEST_F(RegenTest, GpuHist) {
-  auto n = this->TestTreeMethod("gpu_hist", "reg:squarederror");
+TEST_F(RegenTest, GpuApprox) {
+  auto ctx = MakeCUDACtx(0);
+  auto n = this->TestTreeMethod(&ctx, "approx", "reg:squarederror", true);
   ASSERT_EQ(n, 1);
-  n = this->TestTreeMethod("gpu_hist", "reg:logistic", false);
+  n = this->TestTreeMethod(&ctx, "approx", "reg:logistic", false);
+  ASSERT_EQ(n, this->Iter());
+
+  n = this->TestTreeMethod(&ctx, "approx", "reg:logistic", true);
+  ASSERT_EQ(n, this->Iter() * 2);
+}
+
+TEST_F(RegenTest, GpuHist) {
+  auto ctx = MakeCUDACtx(0);
+  auto n = this->TestTreeMethod(&ctx, "hist", "reg:squarederror", true);
+  ASSERT_EQ(n, 1);
+  n = this->TestTreeMethod(&ctx, "hist", "reg:logistic", false);
   ASSERT_EQ(n, 1);
-  n = this->TestTreeMethod("hist", "reg:logistic");
-  ASSERT_EQ(n, 2);
+  {
+    Context ctx;
+    n = this->TestTreeMethod(&ctx, "hist", "reg:logistic");
+    ASSERT_EQ(n, 2);
+  }
+}
+
+TEST_F(RegenTest, GpuMixed) {
+  auto ctx = MakeCUDACtx(0);
+  auto n = this->TestTreeMethod(&ctx, "hist", "reg:squarederror", false);
+  ASSERT_EQ(n, 1);
+  n = this->TestTreeMethod(&ctx, "approx", "reg:logistic", true);
+  ASSERT_EQ(n, this->Iter() + 1);
+
+  n = this->TestTreeMethod(&ctx, "approx", "reg:logistic", false);
+  ASSERT_EQ(n, this->Iter());
+  n = this->TestTreeMethod(&ctx, "hist", "reg:squarederror", true);
+  ASSERT_EQ(n, this->Iter() + 1);
 }
 #endif  // defined(XGBOOST_USE_CUDA)
 }  // namespace xgboost
diff --git a/tests/cpp/tree/test_tree_policy.cc b/tests/cpp/tree/test_tree_policy.cc
index 15f4cd31b..50563be1d 100644
--- a/tests/cpp/tree/test_tree_policy.cc
+++ b/tests/cpp/tree/test_tree_policy.cc
@@ -20,10 +20,11 @@ class TestGrowPolicy : public ::testing::Test {
         true);
   }
-  std::unique_ptr<Learner> TrainOneIter(std::string tree_method, std::string policy,
-                                        int32_t max_leaves, int32_t max_depth) {
+  std::unique_ptr<Learner> TrainOneIter(Context const* ctx, std::string tree_method,
+                                        std::string policy, int32_t max_leaves, int32_t max_depth) {
     std::unique_ptr<Learner> learner{Learner::Create({this->Xy_})};
     learner->SetParam("tree_method", tree_method);
+    learner->SetParam("device", ctx->DeviceName());
     if (max_leaves >= 0) {
       learner->SetParam("max_leaves", std::to_string(max_leaves));
     }
@@ -63,7 +64,7 @@ class TestGrowPolicy : public ::testing::Test {
     if (max_leaves == 0 && max_depth == 0) {
       // unconstrained
-      if (tree_method != "gpu_hist") {
+      if (ctx->IsCPU()) {
         // GPU pre-allocates for all nodes.
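// A conceptual sketch, not the actual implementation: the RegenTest
// expectations above follow from when the cached quantised DMatrix must be
// rebuilt. "hist" builds cuts from the feature values alone, so one build is
// reused across iterations; "approx" weights its sketch by the hessian, which
// changes every iteration for objectives like reg:logistic (hence Iter()
// rebuilds) but is constant for reg:squarederror (hence a single build).
// SketchKey and NeedRegen below are hypothetical names for illustration only.
struct SketchKey {
  int max_bin{256};
  bool hessian_weighted{false};  // true for the approx tree method
  int iteration{0};              // only meaningful when hessian_weighted
};

inline bool NeedRegen(SketchKey const& cached, SketchKey const& wanted) {
  if (cached.max_bin != wanted.max_bin) { return true; }
  return wanted.hessian_weighted && cached.iteration != wanted.iteration;
}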
learner->UpdateOneIter(0, Xy_); } @@ -86,23 +87,23 @@ class TestGrowPolicy : public ::testing::Test { return learner; } - void TestCombination(std::string tree_method) { + void TestCombination(Context const* ctx, std::string tree_method) { for (auto policy : {"depthwise", "lossguide"}) { // -1 means default for (auto leaves : {-1, 0, 3}) { for (auto depth : {-1, 0, 3}) { - this->TrainOneIter(tree_method, policy, leaves, depth); + this->TrainOneIter(ctx, tree_method, policy, leaves, depth); } } } } - void TestTreeGrowPolicy(std::string tree_method, std::string policy) { + void TestTreeGrowPolicy(Context const* ctx, std::string tree_method, std::string policy) { { /** * max_leaves */ - auto learner = this->TrainOneIter(tree_method, policy, 16, -1); + auto learner = this->TrainOneIter(ctx, tree_method, policy, 16, -1); Json model{Object{}}; learner->SaveModel(&model); @@ -115,7 +116,7 @@ class TestGrowPolicy : public ::testing::Test { /** * max_depth */ - auto learner = this->TrainOneIter(tree_method, policy, -1, 3); + auto learner = this->TrainOneIter(ctx, tree_method, policy, -1, 3); Json model{Object{}}; learner->SaveModel(&model); @@ -133,25 +134,36 @@ class TestGrowPolicy : public ::testing::Test { }; TEST_F(TestGrowPolicy, Approx) { - this->TestTreeGrowPolicy("approx", "depthwise"); - this->TestTreeGrowPolicy("approx", "lossguide"); + Context ctx; + this->TestTreeGrowPolicy(&ctx, "approx", "depthwise"); + this->TestTreeGrowPolicy(&ctx, "approx", "lossguide"); - this->TestCombination("approx"); + this->TestCombination(&ctx, "approx"); } TEST_F(TestGrowPolicy, Hist) { - this->TestTreeGrowPolicy("hist", "depthwise"); - this->TestTreeGrowPolicy("hist", "lossguide"); + Context ctx; + this->TestTreeGrowPolicy(&ctx, "hist", "depthwise"); + this->TestTreeGrowPolicy(&ctx, "hist", "lossguide"); - this->TestCombination("hist"); + this->TestCombination(&ctx, "hist"); } #if defined(XGBOOST_USE_CUDA) TEST_F(TestGrowPolicy, GpuHist) { - this->TestTreeGrowPolicy("gpu_hist", "depthwise"); - this->TestTreeGrowPolicy("gpu_hist", "lossguide"); + auto ctx = MakeCUDACtx(0); + this->TestTreeGrowPolicy(&ctx, "hist", "depthwise"); + this->TestTreeGrowPolicy(&ctx, "hist", "lossguide"); - this->TestCombination("gpu_hist"); + this->TestCombination(&ctx, "hist"); +} + +TEST_F(TestGrowPolicy, GpuApprox) { + auto ctx = MakeCUDACtx(0); + this->TestTreeGrowPolicy(&ctx, "approx", "depthwise"); + this->TestTreeGrowPolicy(&ctx, "approx", "lossguide"); + + this->TestCombination(&ctx, "approx"); } #endif // defined(XGBOOST_USE_CUDA) } // namespace xgboost diff --git a/tests/cpp/tree/test_tree_stat.cc b/tests/cpp/tree/test_tree_stat.cc index fb64e3a78..d125c84d5 100644 --- a/tests/cpp/tree/test_tree_stat.cc +++ b/tests/cpp/tree/test_tree_stat.cc @@ -135,7 +135,7 @@ class TestMinSplitLoss : public ::testing::Test { gpair_ = GenerateRandomGradients(kRows); } - std::int32_t Update(std::string updater, float gamma) { + std::int32_t Update(Context const* ctx, std::string updater, float gamma) { Args args{{"max_depth", "1"}, {"max_leaves", "0"}, @@ -154,8 +154,7 @@ class TestMinSplitLoss : public ::testing::Test { param.UpdateAllowUnknown(args); ObjInfo task{ObjInfo::kRegression}; - Context ctx{MakeCUDACtx(updater == "grow_gpu_hist" ? 
0 : Context::kCpuId)};
-    auto up = std::unique_ptr<TreeUpdater>{TreeUpdater::Create(updater, &ctx, &task)};
+    auto up = std::unique_ptr<TreeUpdater>{TreeUpdater::Create(updater, ctx, &task)};
     up->Configure({});
     RegTree tree;
@@ -167,16 +166,16 @@ class TestMinSplitLoss : public ::testing::Test {
   }
  public:
-  void RunTest(std::string updater) {
+  void RunTest(Context const* ctx, std::string updater) {
     {
-      int32_t n_nodes = Update(updater, 0.01);
+      int32_t n_nodes = Update(ctx, updater, 0.01);
       // This is not strictly verified; the number `2` is whatever GPU_Hist returned
       // when this test was written, and is only used to check that the larger gamma
       // (below) does prevent the tree from growing.
       ASSERT_EQ(n_nodes, 2);
     }
     {
-      int32_t n_nodes = Update(updater, 100.0);
+      int32_t n_nodes = Update(ctx, updater, 100.0);
       // No new nodes with gamma == 100.
       ASSERT_EQ(n_nodes, static_cast<std::int32_t>(0));
     }
@@ -185,10 +184,25 @@ class TestMinSplitLoss : public ::testing::Test {
 /* Exact tree method requires a pruner as an additional updater, so not tested here. */
-TEST_F(TestMinSplitLoss, Approx) { this->RunTest("grow_histmaker"); }
+TEST_F(TestMinSplitLoss, Approx) {
+  Context ctx;
+  this->RunTest(&ctx, "grow_histmaker");
+}
+
+TEST_F(TestMinSplitLoss, Hist) {
+  Context ctx;
+  this->RunTest(&ctx, "grow_quantile_histmaker");
+}
-TEST_F(TestMinSplitLoss, Hist) { this->RunTest("grow_quantile_histmaker"); }
 #if defined(XGBOOST_USE_CUDA)
-TEST_F(TestMinSplitLoss, GpuHist) { this->RunTest("grow_gpu_hist"); }
+TEST_F(TestMinSplitLoss, GpuHist) {
+  auto ctx = MakeCUDACtx(0);
+  this->RunTest(&ctx, "grow_gpu_hist");
+}
+
+TEST_F(TestMinSplitLoss, GpuApprox) {
+  auto ctx = MakeCUDACtx(0);
+  this->RunTest(&ctx, "grow_gpu_approx");
+}
 #endif  // defined(XGBOOST_USE_CUDA)
 }  // namespace xgboost
diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py
index 7fea42f60..653a99f3a 100644
--- a/tests/python-gpu/test_gpu_updaters.py
+++ b/tests/python-gpu/test_gpu_updaters.py
@@ -7,11 +7,18 @@ from hypothesis import assume, given, note, settings, strategies
 import xgboost as xgb
 from xgboost import testing as tm
-from xgboost.testing.params import cat_parameter_strategy, hist_parameter_strategy
+from xgboost.testing.params import (
+    cat_parameter_strategy,
+    exact_parameter_strategy,
+    hist_parameter_strategy,
+)
 from xgboost.testing.updater import (
+    check_categorical_missing,
+    check_categorical_ohe,
     check_get_quantile_cut,
     check_init_estimation,
     check_quantile_loss,
+    train_result,
 )
 sys.path.append("tests/python")
@@ -20,22 +27,6 @@ import test_updaters as test_up
 pytestmark = tm.timeout(30)
-def train_result(param, dmat: xgb.DMatrix, num_rounds: int) -> dict:
-    result: xgb.callback.TrainingCallback.EvalsLog = {}
-    booster = xgb.train(
-        param,
-        dmat,
-        num_rounds,
-        [(dmat, "train")],
-        verbose_eval=False,
-        evals_result=result,
-    )
-    assert booster.num_features() == dmat.num_col()
-    assert booster.num_boosted_rounds() == num_rounds
-
-    return result
-
-
 class TestGPUUpdatersMulti:
     @given(
         hist_parameter_strategy, strategies.integers(1, 20), tm.multi_dataset_strategy
     )
@@ -53,14 +44,45 @@ class TestGPUUpdaters:
     cputest = test_up.TestTreeMethod()
     @given(
-        hist_parameter_strategy, strategies.integers(1, 20), tm.make_dataset_strategy()
+        exact_parameter_strategy,
+        hist_parameter_strategy,
+        strategies.integers(1, 20),
+        tm.make_dataset_strategy(),
     )
     @settings(deadline=None, max_examples=50, print_blob=True)
-    def test_gpu_hist(self, param, num_rounds, dataset):
-        param["tree_method"] = "gpu_hist"
+    def test_gpu_hist(
+        self,
+        param:
Dict[str, Any], + hist_param: Dict[str, Any], + num_rounds: int, + dataset: tm.TestDataset, + ) -> None: + param.update({"tree_method": "hist", "device": "cuda"}) + param.update(hist_param) param = dataset.set_params(param) result = train_result(param, dataset.get_dmat(), num_rounds) - note(result) + note(str(result)) + assert tm.non_increasing(result["train"][dataset.metric]) + + @given( + exact_parameter_strategy, + hist_parameter_strategy, + strategies.integers(1, 20), + tm.make_dataset_strategy(), + ) + @settings(deadline=None, print_blob=True) + def test_gpu_approx( + self, + param: Dict[str, Any], + hist_param: Dict[str, Any], + num_rounds: int, + dataset: tm.TestDataset, + ) -> None: + param.update({"tree_method": "approx", "device": "cuda"}) + param.update(hist_param) + param = dataset.set_params(param) + result = train_result(param, dataset.get_dmat(), num_rounds) + note(str(result)) assert tm.non_increasing(result["train"][dataset.metric]) @given(tm.sparse_datasets_strategy) @@ -69,23 +91,27 @@ class TestGPUUpdaters: param = {"tree_method": "hist", "max_bin": 64} hist_result = train_result(param, dataset.get_dmat(), 16) note(hist_result) - assert tm.non_increasing(hist_result['train'][dataset.metric]) + assert tm.non_increasing(hist_result["train"][dataset.metric]) param = {"tree_method": "gpu_hist", "max_bin": 64} gpu_hist_result = train_result(param, dataset.get_dmat(), 16) note(gpu_hist_result) - assert tm.non_increasing(gpu_hist_result['train'][dataset.metric]) + assert tm.non_increasing(gpu_hist_result["train"][dataset.metric]) np.testing.assert_allclose( hist_result["train"]["rmse"], gpu_hist_result["train"]["rmse"], rtol=1e-2 ) - @given(strategies.integers(10, 400), strategies.integers(3, 8), - strategies.integers(1, 2), strategies.integers(4, 7)) + @given( + strategies.integers(10, 400), + strategies.integers(3, 8), + strategies.integers(1, 2), + strategies.integers(4, 7), + ) @settings(deadline=None, max_examples=20, print_blob=True) @pytest.mark.skipif(**tm.no_pandas()) def test_categorical_ohe(self, rows, cols, rounds, cats): - self.cputest.run_categorical_ohe(rows, cols, rounds, cats, "gpu_hist") + check_categorical_ohe(rows, cols, rounds, cats, "cuda", "hist") @given( tm.categorical_dataset_strategy, @@ -95,7 +121,7 @@ class TestGPUUpdaters: ) @settings(deadline=None, max_examples=20, print_blob=True) @pytest.mark.skipif(**tm.no_pandas()) - def test_categorical( + def test_categorical_hist( self, dataset: tm.TestDataset, hist_parameters: Dict[str, Any], @@ -103,7 +129,30 @@ class TestGPUUpdaters: n_rounds: int, ) -> None: cat_parameters.update(hist_parameters) - cat_parameters["tree_method"] = "gpu_hist" + cat_parameters["tree_method"] = "hist" + cat_parameters["device"] = "cuda" + + results = train_result(cat_parameters, dataset.get_dmat(), n_rounds) + tm.non_increasing(results["train"]["rmse"]) + + @given( + tm.categorical_dataset_strategy, + hist_parameter_strategy, + cat_parameter_strategy, + strategies.integers(4, 32), + ) + @settings(deadline=None, max_examples=20, print_blob=True) + @pytest.mark.skipif(**tm.no_pandas()) + def test_categorical_approx( + self, + dataset: tm.TestDataset, + hist_parameters: Dict[str, Any], + cat_parameters: Dict[str, Any], + n_rounds: int, + ) -> None: + cat_parameters.update(hist_parameters) + cat_parameters["tree_method"] = "approx" + cat_parameters["device"] = "cuda" results = train_result(cat_parameters, dataset.get_dmat(), n_rounds) tm.non_increasing(results["train"]["rmse"]) @@ -129,24 +178,25 @@ class TestGPUUpdaters: 
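// A minimal sketch, not part of the patch: the Python hunks above and below
// replace tree_method="gpu_hist" with a tree_method ("hist" or "approx") plus
// device="cuda" pair. The same configuration through the C++ Learner used
// elsewhere in this series looks as follows; TrainOneIterOnGpu is a
// hypothetical helper, `Xy` is a prepared DMatrix, error handling is omitted.
#include <memory>

#include "xgboost/data.h"     // DMatrix
#include "xgboost/learner.h"  // Learner

void TrainOneIterOnGpu(std::shared_ptr<xgboost::DMatrix> Xy) {
  std::unique_ptr<xgboost::Learner> learner{xgboost::Learner::Create({Xy})};
  learner->SetParam("tree_method", "approx");
  learner->SetParam("device", "cuda:0");  // replaces tree_method="gpu_hist"
  learner->Configure();
  learner->UpdateOneIter(0, Xy);
}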
@given( strategies.integers(10, 400), strategies.integers(3, 8), - strategies.integers(4, 7) + strategies.integers(4, 7), ) @settings(deadline=None, max_examples=20, print_blob=True) @pytest.mark.skipif(**tm.no_pandas()) def test_categorical_missing(self, rows, cols, cats): - self.cputest.run_categorical_missing(rows, cols, cats, "gpu_hist") + check_categorical_missing(rows, cols, cats, "cuda", "approx") + check_categorical_missing(rows, cols, cats, "cuda", "hist") @pytest.mark.skipif(**tm.no_pandas()) def test_max_cat(self) -> None: self.cputest.run_max_cat("gpu_hist") def test_categorical_32_cat(self): - '''32 hits the bound of integer bitset, so special test''' + """32 hits the bound of integer bitset, so special test""" rows = 1000 cols = 10 cats = 32 rounds = 4 - self.cputest.run_categorical_ohe(rows, cols, rounds, cats, "gpu_hist") + check_categorical_ohe(rows, cols, rounds, cats, "cuda", "hist") @pytest.mark.skipif(**tm.no_cupy()) def test_invalid_category(self): @@ -164,15 +214,15 @@ class TestGPUUpdaters: ) -> None: # We cannot handle empty dataset yet assume(len(dataset.y) > 0) - param['tree_method'] = 'gpu_hist' + param["tree_method"] = "gpu_hist" param = dataset.set_params(param) result = train_result( param, dataset.get_device_dmat(max_bin=param.get("max_bin", None)), - num_rounds + num_rounds, ) note(result) - assert tm.non_increasing(result['train'][dataset.metric], tolerance=1e-3) + assert tm.non_increasing(result["train"][dataset.metric], tolerance=1e-3) @given( hist_parameter_strategy, @@ -185,12 +235,12 @@ class TestGPUUpdaters: return # We cannot handle empty dataset yet assume(len(dataset.y) > 0) - param['tree_method'] = 'gpu_hist' + param["tree_method"] = "gpu_hist" param = dataset.set_params(param) m = dataset.get_external_dmat() external_result = train_result(param, m, num_rounds) del m - assert tm.non_increasing(external_result['train'][dataset.metric]) + assert tm.non_increasing(external_result["train"][dataset.metric]) def test_empty_dmatrix_prediction(self): # FIXME(trivialfis): This should be done with all updaters @@ -207,7 +257,7 @@ class TestGPUUpdaters: dtrain, verbose_eval=True, num_boost_round=6, - evals=[(dtrain, 'Train')] + evals=[(dtrain, "Train")], ) kRows = 100 @@ -222,10 +272,10 @@ class TestGPUUpdaters: @given(tm.make_dataset_strategy(), strategies.integers(0, 10)) @settings(deadline=None, max_examples=10, print_blob=True) def test_specified_gpu_id_gpu_update(self, dataset, gpu_id): - param = {'tree_method': 'gpu_hist', 'gpu_id': gpu_id} + param = {"tree_method": "gpu_hist", "gpu_id": gpu_id} param = dataset.set_params(param) result = train_result(param, dataset.get_dmat(), 10) - assert tm.non_increasing(result['train'][dataset.metric]) + assert tm.non_increasing(result["train"][dataset.metric]) @pytest.mark.skipif(**tm.no_sklearn()) @pytest.mark.parametrize("weighted", [True, False]) diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index 029911bf0..5374a2891 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -1,6 +1,6 @@ import json from string import ascii_lowercase -from typing import Any, Dict, List +from typing import Any, Dict import numpy as np import pytest @@ -15,30 +15,15 @@ from xgboost.testing.params import ( hist_parameter_strategy, ) from xgboost.testing.updater import ( + check_categorical_missing, + check_categorical_ohe, check_get_quantile_cut, check_init_estimation, check_quantile_loss, + train_result, ) -def train_result(param, dmat, num_rounds): - result = {} - booster = 
xgb.train( - param, - dmat, - num_rounds, - evals=[(dmat, "train")], - verbose_eval=False, - evals_result=result, - ) - assert booster.num_features() == dmat.num_col() - assert booster.num_boosted_rounds() == num_rounds - assert booster.feature_names == dmat.feature_names - assert booster.feature_types == dmat.feature_types - - return result - - class TestTreeMethodMulti: @given( exact_parameter_strategy, strategies.integers(1, 20), tm.multi_dataset_strategy @@ -281,115 +266,6 @@ class TestTreeMethod: def test_max_cat(self, tree_method) -> None: self.run_max_cat(tree_method) - def run_categorical_missing( - self, rows: int, cols: int, cats: int, tree_method: str - ) -> None: - parameters: Dict[str, Any] = {"tree_method": tree_method} - cat, label = tm.make_categorical( - rows, n_features=cols, n_categories=cats, onehot=False, sparsity=0.5 - ) - Xy = xgb.DMatrix(cat, label, enable_categorical=True) - - def run(max_cat_to_onehot: int): - # Test with onehot splits - parameters["max_cat_to_onehot"] = max_cat_to_onehot - - evals_result: Dict[str, Dict] = {} - booster = xgb.train( - parameters, - Xy, - num_boost_round=16, - evals=[(Xy, "Train")], - evals_result=evals_result - ) - assert tm.non_increasing(evals_result["Train"]["rmse"]) - y_predt = booster.predict(Xy) - - rmse = tm.root_mean_square(label, y_predt) - np.testing.assert_allclose( - rmse, evals_result["Train"]["rmse"][-1], rtol=2e-5 - ) - - # Test with OHE split - run(self.USE_ONEHOT) - - # Test with partition-based split - run(self.USE_PART) - - def run_categorical_ohe( - self, rows: int, cols: int, rounds: int, cats: int, tree_method: str - ) -> None: - onehot, label = tm.make_categorical(rows, cols, cats, True) - cat, _ = tm.make_categorical(rows, cols, cats, False) - - by_etl_results: Dict[str, Dict[str, List[float]]] = {} - by_builtin_results: Dict[str, Dict[str, List[float]]] = {} - - parameters: Dict[str, Any] = { - "tree_method": tree_method, - # Use one-hot exclusively - "max_cat_to_onehot": self.USE_ONEHOT - } - - m = xgb.DMatrix(onehot, label, enable_categorical=False) - xgb.train( - parameters, - m, - num_boost_round=rounds, - evals=[(m, "Train")], - evals_result=by_etl_results, - ) - - m = xgb.DMatrix(cat, label, enable_categorical=True) - xgb.train( - parameters, - m, - num_boost_round=rounds, - evals=[(m, "Train")], - evals_result=by_builtin_results, - ) - - # There are guidelines on how to specify tolerance based on considering output - # as random variables. But in here the tree construction is extremely sensitive - # to floating point errors. An 1e-5 error in a histogram bin can lead to an - # entirely different tree. So even though the test is quite lenient, hypothesis - # can still pick up falsifying examples from time to time. - np.testing.assert_allclose( - np.array(by_etl_results["Train"]["rmse"]), - np.array(by_builtin_results["Train"]["rmse"]), - rtol=1e-3, - ) - assert tm.non_increasing(by_builtin_results["Train"]["rmse"]) - - by_grouping: Dict[str, Dict[str, List[float]]] = {} - # switch to partition-based splits - parameters["max_cat_to_onehot"] = self.USE_PART - parameters["reg_lambda"] = 0 - m = xgb.DMatrix(cat, label, enable_categorical=True) - xgb.train( - parameters, - m, - num_boost_round=rounds, - evals=[(m, "Train")], - evals_result=by_grouping, - ) - rmse_oh = by_builtin_results["Train"]["rmse"] - rmse_group = by_grouping["Train"]["rmse"] - # always better or equal to onehot when there's no regularization. 
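A math aside, not part of the patch: the assertion removed just below and the TestMinSplitLoss expectations earlier in this series both follow from the regularised objective XGBoost optimises. Each leaf takes the weight

\[ w_j^* = -\frac{G_j}{H_j + \lambda}, \]

and a candidate split is kept only when its gain exceeds gamma (min_split_loss):

\[ \text{gain} = \frac{1}{2}\left[\frac{G_L^2}{H_L + \lambda} + \frac{G_R^2}{H_R + \lambda} - \frac{(G_L + G_R)^2}{H_L + H_R + \lambda}\right] - \gamma > 0, \]

which is why gamma = 100 produces zero new nodes while gamma = 0.01 permits a split. With lambda = 0 the leaf weights are unshrunk, and partition-based categorical splits search a superset of the one-hot partitions, so their training RMSE can never be worse (exactly the assertion removed below); with lambda = 1 that guarantee disappears, hence the relaxed non-increasing check that replaces it.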
- for a, b in zip(rmse_oh, rmse_group): - assert a >= b - - parameters["reg_lambda"] = 1.0 - by_grouping = {} - xgb.train( - parameters, - m, - num_boost_round=32, - evals=[(m, "Train")], - evals_result=by_grouping, - ) - assert tm.non_increasing(by_grouping["Train"]["rmse"]), by_grouping - @given(strategies.integers(10, 400), strategies.integers(3, 8), strategies.integers(1, 2), strategies.integers(4, 7)) @settings(deadline=None, print_blob=True) @@ -397,8 +273,8 @@ class TestTreeMethod: def test_categorical_ohe( self, rows: int, cols: int, rounds: int, cats: int ) -> None: - self.run_categorical_ohe(rows, cols, rounds, cats, "approx") - self.run_categorical_ohe(rows, cols, rounds, cats, "hist") + check_categorical_ohe(rows, cols, rounds, cats, "cpu", "approx") + check_categorical_ohe(rows, cols, rounds, cats, "cpu", "hist") @given( tm.categorical_dataset_strategy, @@ -454,8 +330,8 @@ class TestTreeMethod: @settings(deadline=None, print_blob=True) @pytest.mark.skipif(**tm.no_pandas()) def test_categorical_missing(self, rows, cols, cats): - self.run_categorical_missing(rows, cols, cats, "approx") - self.run_categorical_missing(rows, cols, cats, "hist") + check_categorical_missing(rows, cols, cats, "cpu", "approx") + check_categorical_missing(rows, cols, cats, "cpu", "hist") def run_adaptive(self, tree_method, weighted) -> None: rng = np.random.RandomState(1994) diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py index 9386486de..4cc934579 100644 --- a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py @@ -154,7 +154,6 @@ def run_gpu_hist( DMatrixT: Type, client: Client, ) -> None: - params["tree_method"] = "hist" params["device"] = "cuda" params = dataset.set_params(params) # It doesn't make sense to distribute a completely @@ -275,8 +274,31 @@ class TestDistributedGPU: dmatrix_type: type, local_cuda_client: Client, ) -> None: + params["tree_method"] = "hist" run_gpu_hist(params, num_rounds, dataset, dmatrix_type, local_cuda_client) + @given( + params=hist_parameter_strategy, + num_rounds=strategies.integers(1, 20), + dataset=tm.make_dataset_strategy(), + ) + @settings( + deadline=duration(seconds=120), + max_examples=20, + suppress_health_check=suppress, + print_blob=True, + ) + @pytest.mark.skipif(**tm.no_cupy()) + def test_gpu_approx( + self, + params: Dict, + num_rounds: int, + dataset: tm.TestDataset, + local_cuda_client: Client, + ) -> None: + params["tree_method"] = "approx" + run_gpu_hist(params, num_rounds, dataset, dxgb.DaskDMatrix, local_cuda_client) + def test_empty_quantile_dmatrix(self, local_cuda_client: Client) -> None: client = local_cuda_client X, y = make_categorical(client, 1, 30, 13) From a9da2e244a0102732693a454bab4a0a9154a0414 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 1 Aug 2023 23:03:53 +0800 Subject: [PATCH 057/136] [CI] Update github actions. 
(#9428) --- .github/workflows/scorecards.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index cb8882961..78cde0a43 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -27,21 +27,21 @@ jobs: persist-credentials: false - name: "Run analysis" - uses: ossf/scorecard-action@99c53751e09b9529366343771cc321ec74e9bd3d # tag=v2.0.6 + uses: ossf/scorecard-action@08b4669551908b1024bb425080c797723083c031 # tag=v2.2.0 with: results_file: results.sarif results_format: sarif # Publish the results for public repositories to enable scorecard badges. For more details, see - # https://github.com/ossf/scorecard-action#publishing-results. - # For private repositories, `publish_results` will automatically be set to `false`, regardless + # https://github.com/ossf/scorecard-action#publishing-results. + # For private repositories, `publish_results` will automatically be set to `false`, regardless # of the value entered here. publish_results: true # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@6673cd052c4cd6fcf4b4e6e60ea986c889389535 # tag=v3.0.0 + uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # tag=v3.1.2 with: name: SARIF file path: results.sarif @@ -49,6 +49,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@5f532563584d71fdef14ee64d17bafb34f751ce5 # tag=v1.0.26 + uses: github/codeql-action/upload-sarif@7b6664fa89524ee6e3c3e9749402d5afd69b3cd8 # tag=v2.14.1 with: sarif_file: results.sarif From c2b85ab68a29e1f691ebe61f9d1180cb7484b81f Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Tue, 1 Aug 2023 23:31:18 -0700 Subject: [PATCH 058/136] Clean up MGPU C++ tests (#9430) --- src/collective/communicator.cc | 3 +- src/collective/communicator.cu | 3 +- src/collective/communicator.h | 4 +- .../test_nccl_device_communicator.cu | 9 ++- tests/cpp/common/test_quantile.cu | 39 +++++-------- tests/cpp/helpers.h | 57 +++++++++++++------ tests/cpp/linear/test_json_io.h | 2 +- tests/cpp/linear/test_linear.cc | 4 +- tests/cpp/metric/test_auc.cc | 24 ++++---- tests/cpp/metric/test_auc.h | 12 ++-- tests/cpp/metric/test_elementwise_metric.cc | 40 ++++++------- tests/cpp/metric/test_elementwise_metric.h | 38 ++++++------- tests/cpp/metric/test_metric.cc | 10 +--- tests/cpp/metric/test_multiclass_metric.cc | 8 +-- tests/cpp/metric/test_multiclass_metric.h | 8 +-- tests/cpp/metric/test_rank_metric.cc | 18 +++--- tests/cpp/metric/test_rank_metric.h | 8 +-- tests/cpp/metric/test_survival_metric.cu | 12 ++-- tests/cpp/metric/test_survival_metric.h | 6 +- tests/cpp/objective/test_aft_obj.cc | 10 ++-- tests/cpp/objective/test_hinge.cc | 2 +- tests/cpp/objective/test_lambdarank_obj.cc | 2 +- tests/cpp/objective/test_multiclass_obj.cc | 6 +- tests/cpp/objective/test_quantile_obj.cc | 4 +- tests/cpp/objective/test_regression_obj.cc | 32 +++++------ tests/cpp/plugin/test_example_objective.cc | 2 +- tests/cpp/plugin/test_federated_adapter.cu | 18 +++--- tests/cpp/predictor/test_gpu_predictor.cu | 13 ++--- 28 files changed, 200 insertions(+), 194 deletions(-) diff --git a/src/collective/communicator.cc b/src/collective/communicator.cc index 22c85f3ad..e4c491c2b 100644 --- a/src/collective/communicator.cc +++ b/src/collective/communicator.cc @@ 
-41,7 +41,8 @@ void Communicator::Init(Json const& config) { #endif break; } - case CommunicatorType::kInMemory: { + case CommunicatorType::kInMemory: + case CommunicatorType::kInMemoryNccl: { communicator_.reset(InMemoryCommunicator::Create(config)); break; } diff --git a/src/collective/communicator.cu b/src/collective/communicator.cu index 915a3beca..a80eab6d5 100644 --- a/src/collective/communicator.cu +++ b/src/collective/communicator.cu @@ -34,9 +34,10 @@ DeviceCommunicator* Communicator::GetDevice(int device_ordinal) { device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false)); break; case CommunicatorType::kFederated: + case CommunicatorType::kInMemory: device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal)); break; - case CommunicatorType::kInMemory: + case CommunicatorType::kInMemoryNccl: device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, true)); break; default: diff --git a/src/collective/communicator.h b/src/collective/communicator.h index 6cda5e47c..def961513 100644 --- a/src/collective/communicator.h +++ b/src/collective/communicator.h @@ -69,7 +69,7 @@ enum class Operation { class DeviceCommunicator; -enum class CommunicatorType { kUnknown, kRabit, kFederated, kInMemory }; +enum class CommunicatorType { kUnknown, kRabit, kFederated, kInMemory, kInMemoryNccl }; /** \brief Case-insensitive string comparison. */ inline int CompareStringsCaseInsensitive(const char *s1, const char *s2) { @@ -220,6 +220,8 @@ class Communicator { result = CommunicatorType::kFederated; } else if (!CompareStringsCaseInsensitive("in-memory", str)) { result = CommunicatorType::kInMemory; + } else if (!CompareStringsCaseInsensitive("in-memory-nccl", str)) { + result = CommunicatorType::kInMemoryNccl; } else { LOG(FATAL) << "Unknown communicator type " << str; } diff --git a/tests/cpp/collective/test_nccl_device_communicator.cu b/tests/cpp/collective/test_nccl_device_communicator.cu index cd9cd26de..d6ed400b2 100644 --- a/tests/cpp/collective/test_nccl_device_communicator.cu +++ b/tests/cpp/collective/test_nccl_device_communicator.cu @@ -46,7 +46,8 @@ TEST(NcclDeviceCommunicator, MGPUAllReduceBitwiseAND) { if (n_gpus <= 1) { GTEST_SKIP() << "Skipping MGPUAllReduceBitwiseAND test with # GPUs = " << n_gpus; } - RunWithInMemoryCommunicator(n_gpus, VerifyAllReduceBitwiseAND); + auto constexpr kUseNccl = true; + RunWithInMemoryCommunicator(n_gpus, VerifyAllReduceBitwiseAND); } namespace { @@ -67,7 +68,8 @@ TEST(NcclDeviceCommunicator, MGPUAllReduceBitwiseOR) { if (n_gpus <= 1) { GTEST_SKIP() << "Skipping MGPUAllReduceBitwiseOR test with # GPUs = " << n_gpus; } - RunWithInMemoryCommunicator(n_gpus, VerifyAllReduceBitwiseOR); + auto constexpr kUseNccl = true; + RunWithInMemoryCommunicator(n_gpus, VerifyAllReduceBitwiseOR); } namespace { @@ -88,7 +90,8 @@ TEST(NcclDeviceCommunicator, MGPUAllReduceBitwiseXOR) { if (n_gpus <= 1) { GTEST_SKIP() << "Skipping MGPUAllReduceBitwiseXOR test with # GPUs = " << n_gpus; } - RunWithInMemoryCommunicator(n_gpus, VerifyAllReduceBitwiseXOR); + auto constexpr kUseNccl = true; + RunWithInMemoryCommunicator(n_gpus, VerifyAllReduceBitwiseXOR); } } // namespace collective diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index eda55ee47..28d698685 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -19,6 +19,9 @@ struct IsSorted { }; } namespace common { + +class MGPUQuantileTest : public BaseMGPUTest {}; + TEST(GPUQuantile, Basic) { constexpr size_t kRows = 
1000, kCols = 100, kBins = 256; HostDeviceVector ft; @@ -344,12 +347,11 @@ TEST(GPUQuantile, MultiMerge) { } namespace { -void TestAllReduceBasic(int32_t n_gpus) { +void TestAllReduceBasic() { auto const world = collective::GetWorldSize(); - CHECK_EQ(world, n_gpus); constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) { - auto const device = collective::GetRank(); + auto const device = GetGPUId(); // Set up single node version; HostDeviceVector ft({}, device); @@ -422,12 +424,8 @@ void TestAllReduceBasic(int32_t n_gpus) { } } // anonymous namespace -TEST(GPUQuantile, MGPUAllReduceBasic) { - auto const n_gpus = AllVisibleGPUs(); - if (n_gpus <= 1) { - GTEST_SKIP() << "Skipping MGPUAllReduceBasic test with # GPUs = " << n_gpus; - } - RunWithInMemoryCommunicator(n_gpus, TestAllReduceBasic, n_gpus); +TEST_F(MGPUQuantileTest, AllReduceBasic) { + DoTest(TestAllReduceBasic); } namespace { @@ -442,7 +440,7 @@ void TestColumnSplitBasic() { }()}; // Generate cuts for distributed environment. - auto ctx = MakeCUDACtx(rank); + auto ctx = MakeCUDACtx(GetGPUId()); HistogramCuts distributed_cuts = common::DeviceSketch(&ctx, m.get(), kBins); // Generate cuts for single node environment @@ -474,23 +472,18 @@ void TestColumnSplitBasic() { } } // anonymous namespace -TEST(GPUQuantile, MGPUColumnSplitBasic) { - auto const n_gpus = AllVisibleGPUs(); - if (n_gpus <= 1) { - GTEST_SKIP() << "Skipping MGPUColumnSplitBasic test with # GPUs = " << n_gpus; - } - RunWithInMemoryCommunicator(n_gpus, TestColumnSplitBasic); +TEST_F(MGPUQuantileTest, ColumnSplitBasic) { + DoTest(TestColumnSplitBasic); } namespace { -void TestSameOnAllWorkers(std::int32_t n_gpus) { +void TestSameOnAllWorkers() { auto world = collective::GetWorldSize(); - CHECK_EQ(world, n_gpus); constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) { auto const rank = collective::GetRank(); - auto const device = rank; + auto const device = GetGPUId(); HostDeviceVector ft({}, device); SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device); HostDeviceVector storage({}, device); @@ -544,12 +537,8 @@ void TestSameOnAllWorkers(std::int32_t n_gpus) { } } // anonymous namespace -TEST(GPUQuantile, MGPUSameOnAllWorkers) { - auto const n_gpus = AllVisibleGPUs(); - if (n_gpus <= 1) { - GTEST_SKIP() << "Skipping MGPUSameOnAllWorkers test with # GPUs = " << n_gpus; - } - RunWithInMemoryCommunicator(n_gpus, TestSameOnAllWorkers, n_gpus); +TEST_F(MGPUQuantileTest, SameOnAllWorkers) { + DoTest(TestSameOnAllWorkers); } TEST(GPUQuantile, Push) { diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index b166109d9..6cb0b3405 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -34,24 +34,12 @@ #define DeclareUnifiedTest(name) name #endif -#if defined(__CUDACC__) -#define GPUIDX 0 -#else -#define GPUIDX -1 -#endif - #if defined(__CUDACC__) #define DeclareUnifiedDistributedTest(name) MGPU ## name #else #define DeclareUnifiedDistributedTest(name) name #endif -#if defined(__CUDACC__) -#define WORLD_SIZE_FOR_TEST (xgboost::common::AllVisibleGPUs()) -#else -#define WORLD_SIZE_FOR_TEST (3) -#endif - namespace xgboost { class ObjFunction; class Metric; @@ -522,11 +510,15 @@ inline LearnerModelParam MakeMP(bst_feature_t n_features, float base_score, uint inline std::int32_t AllThreadsForTest() { return Context{}.Threads(); } -template +template void RunWithInMemoryCommunicator(int32_t world_size, Function&& function, 
Args&&... args) { auto run = [&](auto rank) { Json config{JsonObject()}; - config["xgboost_communicator"] = String("in-memory"); + if constexpr (use_nccl) { + config["xgboost_communicator"] = String("in-memory-nccl"); + } else { + config["xgboost_communicator"] = String("in-memory"); + } config["in_memory_world_size"] = world_size; config["in_memory_rank"] = rank; xgboost::collective::Init(config); @@ -548,15 +540,44 @@ void RunWithInMemoryCommunicator(int32_t world_size, Function&& function, Args&& #endif } -class DeclareUnifiedDistributedTest(MetricTest) : public ::testing::Test { +inline int GetGPUId() { +#if defined(__CUDACC__) + auto const n_gpus = common::AllVisibleGPUs(); + return n_gpus == 1 ? 0 : collective::GetRank(); +#else + return -1; +#endif +} + +class BaseMGPUTest : public ::testing::Test { protected: int world_size_; + bool use_nccl_{false}; void SetUp() override { - world_size_ = WORLD_SIZE_FOR_TEST; - if (world_size_ <= 1) { - GTEST_SKIP() << "Skipping MGPU test with # GPUs = " << world_size_; + auto const n_gpus = common::AllVisibleGPUs(); + if (n_gpus <= 1) { + // Use a single GPU to simulate distributed environment. + world_size_ = 3; + // NCCL doesn't like sharing a single GPU, so we use the adapter instead. + use_nccl_ = false; + } else { + // Use multiple GPUs for real. + world_size_ = n_gpus; + use_nccl_ = true; + } + } + + template + void DoTest(Function&& function, Args&&... args) { + if (use_nccl_) { + RunWithInMemoryCommunicator(world_size_, function, args...); + } else { + RunWithInMemoryCommunicator(world_size_, function, args...); } } }; + +class DeclareUnifiedDistributedTest(MetricTest) : public BaseMGPUTest{}; + } // namespace xgboost diff --git a/tests/cpp/linear/test_json_io.h b/tests/cpp/linear/test_json_io.h index c423448e2..48d4497c3 100644 --- a/tests/cpp/linear/test_json_io.h +++ b/tests/cpp/linear/test_json_io.h @@ -12,7 +12,7 @@ namespace xgboost { inline void TestUpdaterJsonIO(std::string updater_str) { - Context ctx{MakeCUDACtx(GPUIDX)}; + Context ctx{MakeCUDACtx(GetGPUId())}; Json config_0 {Object() }; { diff --git a/tests/cpp/linear/test_linear.cc b/tests/cpp/linear/test_linear.cc index 6b2d17e10..f15a47e64 100644 --- a/tests/cpp/linear/test_linear.cc +++ b/tests/cpp/linear/test_linear.cc @@ -17,7 +17,7 @@ TEST(Linear, Shotgun) { auto p_fmat = xgboost::RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(); - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); LearnerModelParam mparam{MakeMP(kCols, .5, 1)}; { @@ -49,7 +49,7 @@ TEST(Linear, coordinate) { auto p_fmat = xgboost::RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(); - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); LearnerModelParam mparam{MakeMP(kCols, .5, 1)}; auto updater = std::unique_ptr( diff --git a/tests/cpp/metric/test_auc.cc b/tests/cpp/metric/test_auc.cc index de42bba53..eea54fc32 100644 --- a/tests/cpp/metric/test_auc.cc +++ b/tests/cpp/metric/test_auc.cc @@ -18,51 +18,51 @@ TEST(Metric, DeclareUnifiedTest(MultiClassPRAUC)) { VerifyMultiClassPRAUC(); } TEST(Metric, DeclareUnifiedTest(RankingPRAUC)) { VerifyRankingPRAUC(); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), BinaryAUCRowSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyBinaryAUC, DataSplitMode::kRow); + DoTest(VerifyBinaryAUC, DataSplitMode::kRow); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), BinaryAUCColumnSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyBinaryAUC, DataSplitMode::kCol); + DoTest(VerifyBinaryAUC, DataSplitMode::kCol); } 
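// A minimal sketch, not part of the patch: with the BaseMGPUTest fixture
// introduced above, a new multi-GPU test only supplies a verification
// function; DoTest selects the NCCL-backed in-memory communicator when
// several GPUs are visible and otherwise simulates three workers on a single
// GPU through the adapter. MGPUExampleTest and VerifyExample are hypothetical
// names for illustration.
class MGPUExampleTest : public BaseMGPUTest {};

namespace {
void VerifyExample() {
  auto const rank = xgboost::collective::GetRank();
  auto const device = xgboost::GetGPUId();  // rank, or 0 when simulating
  // ... exercise the distributed code path for this rank on `device` ...
  (void)rank;
  (void)device;
}
}  // namespace

TEST_F(MGPUExampleTest, Basic) { DoTest(VerifyExample); }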
TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassAUCRowSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassAUC, DataSplitMode::kRow); + DoTest(VerifyMultiClassAUC, DataSplitMode::kRow); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassAUCColumnSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassAUC, DataSplitMode::kCol); + DoTest(VerifyMultiClassAUC, DataSplitMode::kCol); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), RankingAUCRowSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyRankingAUC, DataSplitMode::kRow); + DoTest(VerifyRankingAUC, DataSplitMode::kRow); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), RankingAUCColumnSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyRankingAUC, DataSplitMode::kCol); + DoTest(VerifyRankingAUC, DataSplitMode::kCol); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), PRAUCRowSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyPRAUC, DataSplitMode::kRow); + DoTest(VerifyPRAUC, DataSplitMode::kRow); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), PRAUCColumnSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyPRAUC, DataSplitMode::kCol); + DoTest(VerifyPRAUC, DataSplitMode::kCol); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassPRAUCRowSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassPRAUC, DataSplitMode::kRow); + DoTest(VerifyMultiClassPRAUC, DataSplitMode::kRow); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassPRAUCColumnSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassPRAUC, DataSplitMode::kCol); + DoTest(VerifyMultiClassPRAUC, DataSplitMode::kCol); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), RankingPRAUCRowSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyRankingPRAUC, DataSplitMode::kRow); + DoTest(VerifyRankingPRAUC, DataSplitMode::kRow); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), RankingPRAUCColumnSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyRankingPRAUC, DataSplitMode::kCol); + DoTest(VerifyRankingPRAUC, DataSplitMode::kCol); } } // namespace metric } // namespace xgboost diff --git a/tests/cpp/metric/test_auc.h b/tests/cpp/metric/test_auc.h index 0dd3dd83e..cd0095ebb 100644 --- a/tests/cpp/metric/test_auc.h +++ b/tests/cpp/metric/test_auc.h @@ -11,7 +11,7 @@ namespace xgboost { namespace metric { inline void VerifyBinaryAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); std::unique_ptr uni_ptr{Metric::Create("auc", &ctx)}; Metric* metric = uni_ptr.get(); ASSERT_STREQ(metric->Name(), "auc"); @@ -54,7 +54,7 @@ inline void VerifyBinaryAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) } inline void VerifyMultiClassAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); std::unique_ptr uni_ptr{Metric::Create("auc", &ctx)}; auto metric = uni_ptr.get(); @@ -115,7 +115,7 @@ inline void VerifyMultiClassAUC(DataSplitMode data_split_mode = DataSplitMode::k } inline void VerifyRankingAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); std::unique_ptr metric{Metric::Create("auc", &ctx)}; // single group @@ -149,7 +149,7 @@ inline void VerifyRankingAUC(DataSplitMode data_split_mode = DataSplitMode::kRow } inline void VerifyPRAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GPUIDX); 
+ auto ctx = MakeCUDACtx(GetGPUId()); xgboost::Metric* metric = xgboost::Metric::Create("aucpr", &ctx); ASSERT_STREQ(metric->Name(), "aucpr"); @@ -186,7 +186,7 @@ inline void VerifyPRAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) { } inline void VerifyMultiClassPRAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); std::unique_ptr metric{Metric::Create("aucpr", &ctx)}; @@ -210,7 +210,7 @@ inline void VerifyMultiClassPRAUC(DataSplitMode data_split_mode = DataSplitMode: } inline void VerifyRankingPRAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); std::unique_ptr metric{Metric::Create("aucpr", &ctx)}; diff --git a/tests/cpp/metric/test_elementwise_metric.cc b/tests/cpp/metric/test_elementwise_metric.cc index 2407dde39..13021fb6a 100644 --- a/tests/cpp/metric/test_elementwise_metric.cc +++ b/tests/cpp/metric/test_elementwise_metric.cc @@ -26,83 +26,83 @@ TEST(Metric, DeclareUnifiedTest(MultiRMSE)) { VerifyMultiRMSE(); } TEST(Metric, DeclareUnifiedTest(Quantile)) { VerifyQuantile(); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), RMSERowSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyRMSE, DataSplitMode::kRow); + DoTest(VerifyRMSE, DataSplitMode::kRow); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), RMSEColumnSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyRMSE, DataSplitMode::kCol); + DoTest(VerifyRMSE, DataSplitMode::kCol); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), RMSLERowSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyRMSLE, DataSplitMode::kRow); + DoTest(VerifyRMSLE, DataSplitMode::kRow); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), RMSLEColumnSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyRMSLE, DataSplitMode::kCol); + DoTest(VerifyRMSLE, DataSplitMode::kCol); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), MAERowSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyMAE, DataSplitMode::kRow); + DoTest(VerifyMAE, DataSplitMode::kRow); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), MAEColumnSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyMAE, DataSplitMode::kCol); + DoTest(VerifyMAE, DataSplitMode::kCol); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), MAPERowSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyMAPE, DataSplitMode::kRow); + DoTest(VerifyMAPE, DataSplitMode::kRow); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), MAPEColumnSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyMAPE, DataSplitMode::kCol); + DoTest(VerifyMAPE, DataSplitMode::kCol); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), MPHERowSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyMPHE, DataSplitMode::kRow); + DoTest(VerifyMPHE, DataSplitMode::kRow); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), MPHEColumnSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyMPHE, DataSplitMode::kCol); + DoTest(VerifyMPHE, DataSplitMode::kCol); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), LogLossRowSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyLogLoss, DataSplitMode::kRow); + DoTest(VerifyLogLoss, DataSplitMode::kRow); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), LogLossColumnSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyLogLoss, DataSplitMode::kCol); + DoTest(VerifyLogLoss, DataSplitMode::kCol); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), ErrorRowSplit) { - 
RunWithInMemoryCommunicator(world_size_, &VerifyError, DataSplitMode::kRow); + DoTest(VerifyError, DataSplitMode::kRow); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), ErrorColumnSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyError, DataSplitMode::kCol); + DoTest(VerifyError, DataSplitMode::kCol); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), PoissonNegLogLikRowSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyPoissonNegLogLik, DataSplitMode::kRow); + DoTest(VerifyPoissonNegLogLik, DataSplitMode::kRow); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), PoissonNegLogLikColumnSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyPoissonNegLogLik, DataSplitMode::kCol); + DoTest(VerifyPoissonNegLogLik, DataSplitMode::kCol); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiRMSERowSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyMultiRMSE, DataSplitMode::kRow); + DoTest(VerifyMultiRMSE, DataSplitMode::kRow); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiRMSEColumnSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyMultiRMSE, DataSplitMode::kCol); + DoTest(VerifyMultiRMSE, DataSplitMode::kCol); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), QuantileRowSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyQuantile, DataSplitMode::kRow); + DoTest(VerifyQuantile, DataSplitMode::kRow); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), QuantileColumnSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyQuantile, DataSplitMode::kCol); + DoTest(VerifyQuantile, DataSplitMode::kCol); } } // namespace metric } // namespace xgboost diff --git a/tests/cpp/metric/test_elementwise_metric.h b/tests/cpp/metric/test_elementwise_metric.h index a32bb0438..9a3d3fe9f 100644 --- a/tests/cpp/metric/test_elementwise_metric.h +++ b/tests/cpp/metric/test_elementwise_metric.h @@ -46,7 +46,7 @@ inline void CheckDeterministicMetricElementWise(StringView name, int32_t device) } inline void VerifyRMSE(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); xgboost::Metric * metric = xgboost::Metric::Create("rmse", &ctx); metric->Configure({}); ASSERT_STREQ(metric->Name(), "rmse"); @@ -71,11 +71,11 @@ inline void VerifyRMSE(DataSplitMode data_split_mode = DataSplitMode::kRow) { 0.6708f, 0.001f); delete metric; - CheckDeterministicMetricElementWise(StringView{"rmse"}, GPUIDX); + CheckDeterministicMetricElementWise(StringView{"rmse"}, GetGPUId()); } inline void VerifyRMSLE(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); xgboost::Metric * metric = xgboost::Metric::Create("rmsle", &ctx); metric->Configure({}); ASSERT_STREQ(metric->Name(), "rmsle"); @@ -100,11 +100,11 @@ inline void VerifyRMSLE(DataSplitMode data_split_mode = DataSplitMode::kRow) { 0.2415f, 1e-4); delete metric; - CheckDeterministicMetricElementWise(StringView{"rmsle"}, GPUIDX); + CheckDeterministicMetricElementWise(StringView{"rmsle"}, GetGPUId()); } inline void VerifyMAE(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); xgboost::Metric * metric = xgboost::Metric::Create("mae", &ctx); metric->Configure({}); ASSERT_STREQ(metric->Name(), "mae"); @@ -129,11 +129,11 @@ inline void VerifyMAE(DataSplitMode data_split_mode = DataSplitMode::kRow) { 0.54f, 0.001f); delete metric; - CheckDeterministicMetricElementWise(StringView{"mae"}, GPUIDX); + 
CheckDeterministicMetricElementWise(StringView{"mae"}, GetGPUId()); } inline void VerifyMAPE(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); xgboost::Metric * metric = xgboost::Metric::Create("mape", &ctx); metric->Configure({}); ASSERT_STREQ(metric->Name(), "mape"); @@ -158,11 +158,11 @@ inline void VerifyMAPE(DataSplitMode data_split_mode = DataSplitMode::kRow) { 1.3250f, 0.001f); delete metric; - CheckDeterministicMetricElementWise(StringView{"mape"}, GPUIDX); + CheckDeterministicMetricElementWise(StringView{"mape"}, GetGPUId()); } inline void VerifyMPHE(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); std::unique_ptr metric{xgboost::Metric::Create("mphe", &ctx)}; metric->Configure({}); ASSERT_STREQ(metric->Name(), "mphe"); @@ -186,7 +186,7 @@ inline void VerifyMPHE(DataSplitMode data_split_mode = DataSplitMode::kRow) { { 1, 2, 9, 8}, {}, data_split_mode), 0.1922f, 1e-4); - CheckDeterministicMetricElementWise(StringView{"mphe"}, GPUIDX); + CheckDeterministicMetricElementWise(StringView{"mphe"}, GetGPUId()); metric->Configure({{"huber_slope", "0.1"}}); EXPECT_NEAR(GetMetricEval(metric.get(), @@ -197,7 +197,7 @@ inline void VerifyMPHE(DataSplitMode data_split_mode = DataSplitMode::kRow) { } inline void VerifyLogLoss(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); xgboost::Metric * metric = xgboost::Metric::Create("logloss", &ctx); metric->Configure({}); ASSERT_STREQ(metric->Name(), "logloss"); @@ -226,11 +226,11 @@ inline void VerifyLogLoss(DataSplitMode data_split_mode = DataSplitMode::kRow) { 1.3138f, 0.001f); delete metric; - CheckDeterministicMetricElementWise(StringView{"logloss"}, GPUIDX); + CheckDeterministicMetricElementWise(StringView{"logloss"}, GetGPUId()); } inline void VerifyError(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); xgboost::Metric * metric = xgboost::Metric::Create("error", &ctx); metric->Configure({}); ASSERT_STREQ(metric->Name(), "error"); @@ -288,11 +288,11 @@ inline void VerifyError(DataSplitMode data_split_mode = DataSplitMode::kRow) { 0.45f, 0.001f); delete metric; - CheckDeterministicMetricElementWise(StringView{"error@0.5"}, GPUIDX); + CheckDeterministicMetricElementWise(StringView{"error@0.5"}, GetGPUId()); } inline void VerifyPoissonNegLogLik(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); xgboost::Metric * metric = xgboost::Metric::Create("poisson-nloglik", &ctx); metric->Configure({}); ASSERT_STREQ(metric->Name(), "poisson-nloglik"); @@ -321,18 +321,18 @@ inline void VerifyPoissonNegLogLik(DataSplitMode data_split_mode = DataSplitMode 1.5783f, 0.001f); delete metric; - CheckDeterministicMetricElementWise(StringView{"poisson-nloglik"}, GPUIDX); + CheckDeterministicMetricElementWise(StringView{"poisson-nloglik"}, GetGPUId()); } inline void VerifyMultiRMSE(DataSplitMode data_split_mode = DataSplitMode::kRow) { size_t n_samples = 32, n_targets = 8; - linalg::Tensor y{{n_samples, n_targets}, GPUIDX}; + linalg::Tensor y{{n_samples, n_targets}, GetGPUId()}; auto &h_y = y.Data()->HostVector(); std::iota(h_y.begin(), h_y.end(), 0); HostDeviceVector predt(n_samples * n_targets, 0); - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); std::unique_ptr 
metric{Metric::Create("rmse", &ctx)}; metric->Configure({}); @@ -347,7 +347,7 @@ inline void VerifyMultiRMSE(DataSplitMode data_split_mode = DataSplitMode::kRow) } inline void VerifyQuantile(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); std::unique_ptr metric{Metric::Create("quantile", &ctx)}; HostDeviceVector predts{0.1f, 0.9f, 0.1f, 0.9f}; diff --git a/tests/cpp/metric/test_metric.cc b/tests/cpp/metric/test_metric.cc index d269dc746..c629a1481 100644 --- a/tests/cpp/metric/test_metric.cc +++ b/tests/cpp/metric/test_metric.cc @@ -4,18 +4,14 @@ #include "../helpers.h" namespace xgboost { TEST(Metric, UnknownMetric) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); xgboost::Metric* metric = nullptr; EXPECT_ANY_THROW(metric = xgboost::Metric::Create("unknown_name", &ctx)); EXPECT_NO_THROW(metric = xgboost::Metric::Create("rmse", &ctx)); - if (metric) { - delete metric; - } + delete metric; metric = nullptr; EXPECT_ANY_THROW(metric = xgboost::Metric::Create("unknown_name@1", &ctx)); EXPECT_NO_THROW(metric = xgboost::Metric::Create("error@0.5f", &ctx)); - if (metric) { - delete metric; - } + delete metric; } } // namespace xgboost diff --git a/tests/cpp/metric/test_multiclass_metric.cc b/tests/cpp/metric/test_multiclass_metric.cc index bfb638924..7fc8bc429 100644 --- a/tests/cpp/metric/test_multiclass_metric.cc +++ b/tests/cpp/metric/test_multiclass_metric.cc @@ -11,19 +11,19 @@ TEST(Metric, DeclareUnifiedTest(MultiClassError)) { VerifyMultiClassError(); } TEST(Metric, DeclareUnifiedTest(MultiClassLogLoss)) { VerifyMultiClassLogLoss(); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassErrorRowSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassError, DataSplitMode::kRow); + DoTest(VerifyMultiClassError, DataSplitMode::kRow); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassErrorColumnSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassError, DataSplitMode::kCol); + DoTest(VerifyMultiClassError, DataSplitMode::kCol); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassLogLossRowSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassLogLoss, DataSplitMode::kRow); + DoTest(VerifyMultiClassLogLoss, DataSplitMode::kRow); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), MultiClassLogLossColumnSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyMultiClassLogLoss, DataSplitMode::kCol); + DoTest(VerifyMultiClassLogLoss, DataSplitMode::kCol); } } // namespace metric } // namespace xgboost diff --git a/tests/cpp/metric/test_multiclass_metric.h b/tests/cpp/metric/test_multiclass_metric.h index 5fdead596..f147c91fa 100644 --- a/tests/cpp/metric/test_multiclass_metric.h +++ b/tests/cpp/metric/test_multiclass_metric.h @@ -60,8 +60,8 @@ inline void TestMultiClassError(int device, DataSplitMode data_split_mode) { } inline void VerifyMultiClassError(DataSplitMode data_split_mode = DataSplitMode::kRow) { - TestMultiClassError(GPUIDX, data_split_mode); - CheckDeterministicMetricMultiClass(StringView{"merror"}, GPUIDX); + TestMultiClassError(GetGPUId(), data_split_mode); + CheckDeterministicMetricMultiClass(StringView{"merror"}, GetGPUId()); } inline void TestMultiClassLogLoss(int device, DataSplitMode data_split_mode) { @@ -81,8 +81,8 @@ inline void TestMultiClassLogLoss(int device, DataSplitMode data_split_mode) { } inline void VerifyMultiClassLogLoss(DataSplitMode data_split_mode = DataSplitMode::kRow) { - 
TestMultiClassLogLoss(GPUIDX, data_split_mode); - CheckDeterministicMetricMultiClass(StringView{"mlogloss"}, GPUIDX); + TestMultiClassLogLoss(GetGPUId(), data_split_mode); + CheckDeterministicMetricMultiClass(StringView{"mlogloss"}, GetGPUId()); } } // namespace metric diff --git a/tests/cpp/metric/test_rank_metric.cc b/tests/cpp/metric/test_rank_metric.cc index 8c83dee5c..066e981b9 100644 --- a/tests/cpp/metric/test_rank_metric.cc +++ b/tests/cpp/metric/test_rank_metric.cc @@ -22,7 +22,7 @@ namespace metric { #if !defined(__CUDACC__) TEST(Metric, AMS) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); EXPECT_ANY_THROW(Metric::Create("ams", &ctx)); Metric* metric = Metric::Create("ams@0.5f", &ctx); ASSERT_STREQ(metric->Name(), "ams@0.5"); @@ -50,35 +50,35 @@ TEST(Metric, DeclareUnifiedTest(MAP)) { VerifyMAP(); } TEST(Metric, DeclareUnifiedTest(NDCGExpGain)) { VerifyNDCGExpGain(); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), PrecisionRowSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyPrecision, DataSplitMode::kRow); + DoTest(VerifyPrecision, DataSplitMode::kRow); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), PrecisionColumnSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyPrecision, DataSplitMode::kCol); + DoTest(VerifyPrecision, DataSplitMode::kCol); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), NDCGRowSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyNDCG, DataSplitMode::kRow); + DoTest(VerifyNDCG, DataSplitMode::kRow); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), NDCGColumnSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyNDCG, DataSplitMode::kCol); + DoTest(VerifyNDCG, DataSplitMode::kCol); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), MAPRowSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyMAP, DataSplitMode::kRow); + DoTest(VerifyMAP, DataSplitMode::kRow); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), MAPColumnSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyMAP, DataSplitMode::kCol); + DoTest(VerifyMAP, DataSplitMode::kCol); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), NDCGExpGainRowSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyNDCGExpGain, DataSplitMode::kRow); + DoTest(VerifyNDCGExpGain, DataSplitMode::kRow); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), NDCGExpGainColumnSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyNDCGExpGain, DataSplitMode::kCol); + DoTest(VerifyNDCGExpGain, DataSplitMode::kCol); } } // namespace metric } // namespace xgboost diff --git a/tests/cpp/metric/test_rank_metric.h b/tests/cpp/metric/test_rank_metric.h index 2f7785689..82d3725f5 100644 --- a/tests/cpp/metric/test_rank_metric.h +++ b/tests/cpp/metric/test_rank_metric.h @@ -20,7 +20,7 @@ namespace xgboost::metric { inline void VerifyPrecision(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); std::unique_ptr metric{Metric::Create("pre", &ctx)}; ASSERT_STREQ(metric->Name(), "pre"); EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0.5, 1e-7); @@ -44,7 +44,7 @@ inline void VerifyPrecision(DataSplitMode data_split_mode = DataSplitMode::kRow) } inline void VerifyNDCG(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); Metric * metric = xgboost::Metric::Create("ndcg", &ctx); ASSERT_STREQ(metric->Name(), "ndcg"); EXPECT_ANY_THROW(GetMetricEval(metric, {0, 1}, {}, {}, {}, 
data_split_mode)); @@ -102,7 +102,7 @@ inline void VerifyNDCG(DataSplitMode data_split_mode = DataSplitMode::kRow) { } inline void VerifyMAP(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); Metric * metric = xgboost::Metric::Create("map", &ctx); ASSERT_STREQ(metric->Name(), "map"); EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}, {}, {}, data_split_mode), 1, kRtEps); @@ -150,7 +150,7 @@ inline void VerifyMAP(DataSplitMode data_split_mode = DataSplitMode::kRow) { } inline void VerifyNDCGExpGain(DataSplitMode data_split_mode = DataSplitMode::kRow) { - Context ctx = MakeCUDACtx(GPUIDX); + Context ctx = MakeCUDACtx(GetGPUId()); auto p_fmat = xgboost::RandomDataGenerator{0, 0, 0}.GenerateDMatrix(); MetaInfo& info = p_fmat->Info(); diff --git a/tests/cpp/metric/test_survival_metric.cu b/tests/cpp/metric/test_survival_metric.cu index e3f4501b5..da97b083b 100644 --- a/tests/cpp/metric/test_survival_metric.cu +++ b/tests/cpp/metric/test_survival_metric.cu @@ -12,26 +12,26 @@ namespace common { TEST(Metric, DeclareUnifiedTest(AFTNegLogLik)) { VerifyAFTNegLogLik(); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), AFTNegLogLikRowSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyAFTNegLogLik, DataSplitMode::kRow); + DoTest(VerifyAFTNegLogLik, DataSplitMode::kRow); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), AFTNegLogLikColumnSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyAFTNegLogLik, DataSplitMode::kCol); + DoTest(VerifyAFTNegLogLik, DataSplitMode::kCol); } TEST(Metric, DeclareUnifiedTest(IntervalRegressionAccuracy)) { VerifyIntervalRegressionAccuracy(); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), IntervalRegressionAccuracyRowSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyIntervalRegressionAccuracy, DataSplitMode::kRow); + DoTest(VerifyIntervalRegressionAccuracy, DataSplitMode::kRow); } TEST_F(DeclareUnifiedDistributedTest(MetricTest), IntervalRegressionAccuracyColumnSplit) { - RunWithInMemoryCommunicator(world_size_, &VerifyIntervalRegressionAccuracy, DataSplitMode::kCol); + DoTest(VerifyIntervalRegressionAccuracy, DataSplitMode::kCol); } // Test configuration of AFT metric TEST(AFTNegLogLikMetric, DeclareUnifiedTest(Configuration)) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); std::unique_ptr metric(Metric::Create("aft-nloglik", &ctx)); metric->Configure({{"aft_loss_distribution", "normal"}, {"aft_loss_distribution_scale", "10"}}); @@ -42,7 +42,7 @@ TEST(AFTNegLogLikMetric, DeclareUnifiedTest(Configuration)) { EXPECT_EQ(get(aft_param_json["aft_loss_distribution"]), "normal"); EXPECT_EQ(get(aft_param_json["aft_loss_distribution_scale"]), "10"); - CheckDeterministicMetricElementWise(StringView{"aft-nloglik"}, GPUIDX); + CheckDeterministicMetricElementWise(StringView{"aft-nloglik"}, GetGPUId()); } } // namespace common } // namespace xgboost diff --git a/tests/cpp/metric/test_survival_metric.h b/tests/cpp/metric/test_survival_metric.h index 1626d3772..5baa5b5a4 100644 --- a/tests/cpp/metric/test_survival_metric.h +++ b/tests/cpp/metric/test_survival_metric.h @@ -48,7 +48,7 @@ inline void CheckDeterministicMetricElementWise(StringView name, int32_t device) } inline void VerifyAFTNegLogLik(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); /** * Test aggregate output from the AFT metric over a small test data set. 
@@ -79,7 +79,7 @@ inline void VerifyAFTNegLogLik(DataSplitMode data_split_mode = DataSplitMode::kR } inline void VerifyIntervalRegressionAccuracy(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); auto p_fmat = EmptyDMatrix(); MetaInfo& info = p_fmat->Info(); @@ -101,7 +101,7 @@ inline void VerifyIntervalRegressionAccuracy(DataSplitMode data_split_mode = Dat info.labels_lower_bound_.HostVector()[0] = 70.0f; EXPECT_FLOAT_EQ(metric->Evaluate(preds, p_fmat), 0.25f); - CheckDeterministicMetricElementWise(StringView{"interval-regression-accuracy"}, GPUIDX); + CheckDeterministicMetricElementWise(StringView{"interval-regression-accuracy"}, GetGPUId()); } } // namespace common } // namespace xgboost diff --git a/tests/cpp/objective/test_aft_obj.cc b/tests/cpp/objective/test_aft_obj.cc index 74973918c..60aebdf3a 100644 --- a/tests/cpp/objective/test_aft_obj.cc +++ b/tests/cpp/objective/test_aft_obj.cc @@ -16,7 +16,7 @@ namespace xgboost { namespace common { TEST(Objective, DeclareUnifiedTest(AFTObjConfiguration)) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); std::unique_ptr objective(ObjFunction::Create("survival:aft", &ctx)); objective->Configure({ {"aft_loss_distribution", "logistic"}, {"aft_loss_distribution_scale", "5"} }); @@ -77,7 +77,7 @@ static inline void CheckGPairOverGridPoints( } TEST(Objective, DeclareUnifiedTest(AFTObjGPairUncensoredLabels)) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); std::unique_ptr obj(ObjFunction::Create("survival:aft", &ctx)); CheckGPairOverGridPoints(obj.get(), 100.0f, 100.0f, "normal", @@ -101,7 +101,7 @@ TEST(Objective, DeclareUnifiedTest(AFTObjGPairUncensoredLabels)) { } TEST(Objective, DeclareUnifiedTest(AFTObjGPairLeftCensoredLabels)) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); std::unique_ptr obj(ObjFunction::Create("survival:aft", &ctx)); CheckGPairOverGridPoints(obj.get(), 0.0f, 20.0f, "normal", @@ -122,7 +122,7 @@ TEST(Objective, DeclareUnifiedTest(AFTObjGPairLeftCensoredLabels)) { } TEST(Objective, DeclareUnifiedTest(AFTObjGPairRightCensoredLabels)) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); std::unique_ptr obj(ObjFunction::Create("survival:aft", &ctx)); CheckGPairOverGridPoints(obj.get(), 60.0f, std::numeric_limits::infinity(), "normal", @@ -146,7 +146,7 @@ TEST(Objective, DeclareUnifiedTest(AFTObjGPairRightCensoredLabels)) { } TEST(Objective, DeclareUnifiedTest(AFTObjGPairIntervalCensoredLabels)) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); std::unique_ptr obj(ObjFunction::Create("survival:aft", &ctx)); CheckGPairOverGridPoints(obj.get(), 16.0f, 200.0f, "normal", diff --git a/tests/cpp/objective/test_hinge.cc b/tests/cpp/objective/test_hinge.cc index 17d2609d4..a4b8525fa 100644 --- a/tests/cpp/objective/test_hinge.cc +++ b/tests/cpp/objective/test_hinge.cc @@ -6,7 +6,7 @@ #include "../helpers.h" namespace xgboost { TEST(Objective, DeclareUnifiedTest(HingeObj)) { - Context ctx = MakeCUDACtx(GPUIDX); + Context ctx = MakeCUDACtx(GetGPUId()); std::unique_ptr obj{ObjFunction::Create("binary:hinge", &ctx)}; float eps = std::numeric_limits::min(); diff --git a/tests/cpp/objective/test_lambdarank_obj.cc b/tests/cpp/objective/test_lambdarank_obj.cc index c808e97f0..0c65780ae 100644 --- a/tests/cpp/objective/test_lambdarank_obj.cc +++ b/tests/cpp/objective/test_lambdarank_obj.cc @@ -71,7 +71,7 @@ void TestNDCGGPair(Context const* ctx) { 
HostDeviceVector predts{0, 1, 0, 1}; MetaInfo info; - info.labels = linalg::Tensor{{0, 1, 0, 1}, {4, 1}, GPUIDX}; + info.labels = linalg::Tensor{{0, 1, 0, 1}, {4, 1}, GetGPUId()}; info.group_ptr_ = {0, 2, 4}; info.num_row_ = 4; HostDeviceVector gpairs; diff --git a/tests/cpp/objective/test_multiclass_obj.cc b/tests/cpp/objective/test_multiclass_obj.cc index d028ef9cf..fa8fc27e4 100644 --- a/tests/cpp/objective/test_multiclass_obj.cc +++ b/tests/cpp/objective/test_multiclass_obj.cc @@ -9,7 +9,7 @@ namespace xgboost { TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassObjGPair)) { - Context ctx = MakeCUDACtx(GPUIDX); + Context ctx = MakeCUDACtx(GetGPUId()); std::vector> args {{"num_class", "3"}}; std::unique_ptr obj { ObjFunction::Create("multi:softmax", &ctx) @@ -36,7 +36,7 @@ TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassObjGPair)) { } TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassBasic)) { - auto ctx = MakeCUDACtx(GPUIDX); + auto ctx = MakeCUDACtx(GetGPUId()); std::vector> args{ std::pair("num_class", "3")}; @@ -57,7 +57,7 @@ TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassBasic)) { } TEST(Objective, DeclareUnifiedTest(SoftprobMultiClassBasic)) { - Context ctx = MakeCUDACtx(GPUIDX); + Context ctx = MakeCUDACtx(GetGPUId()); std::vector> args { std::pair("num_class", "3")}; diff --git a/tests/cpp/objective/test_quantile_obj.cc b/tests/cpp/objective/test_quantile_obj.cc index b263b4a8f..5078440bb 100644 --- a/tests/cpp/objective/test_quantile_obj.cc +++ b/tests/cpp/objective/test_quantile_obj.cc @@ -14,7 +14,7 @@ namespace xgboost { TEST(Objective, DeclareUnifiedTest(Quantile)) { - Context ctx = MakeCUDACtx(GPUIDX); + Context ctx = MakeCUDACtx(GetGPUId()); { Args args{{"quantile_alpha", "[0.6, 0.8]"}}; @@ -37,7 +37,7 @@ TEST(Objective, DeclareUnifiedTest(Quantile)) { } TEST(Objective, DeclareUnifiedTest(QuantileIntercept)) { - Context ctx = MakeCUDACtx(GPUIDX); + Context ctx = MakeCUDACtx(GetGPUId()); Args args{{"quantile_alpha", "[0.6, 0.8]"}}; std::unique_ptr obj{ObjFunction::Create("reg:quantileerror", &ctx)}; obj->Configure(args); diff --git a/tests/cpp/objective/test_regression_obj.cc b/tests/cpp/objective/test_regression_obj.cc index b8a40603b..635fae997 100644 --- a/tests/cpp/objective/test_regression_obj.cc +++ b/tests/cpp/objective/test_regression_obj.cc @@ -17,7 +17,7 @@ namespace xgboost { TEST(Objective, DeclareUnifiedTest(LinearRegressionGPair)) { - Context ctx = MakeCUDACtx(GPUIDX); + Context ctx = MakeCUDACtx(GetGPUId()); std::vector> args; std::unique_ptr obj{ObjFunction::Create("reg:squarederror", &ctx)}; @@ -39,7 +39,7 @@ TEST(Objective, DeclareUnifiedTest(LinearRegressionGPair)) { } TEST(Objective, DeclareUnifiedTest(SquaredLog)) { - Context ctx = MakeCUDACtx(GPUIDX); + Context ctx = MakeCUDACtx(GetGPUId()); std::vector> args; std::unique_ptr obj{ObjFunction::Create("reg:squaredlogerror", &ctx)}; @@ -62,7 +62,7 @@ TEST(Objective, DeclareUnifiedTest(SquaredLog)) { } TEST(Objective, DeclareUnifiedTest(PseudoHuber)) { - Context ctx = MakeCUDACtx(GPUIDX); + Context ctx = MakeCUDACtx(GetGPUId()); Args args; std::unique_ptr obj{ObjFunction::Create("reg:pseudohubererror", &ctx)}; @@ -91,7 +91,7 @@ TEST(Objective, DeclareUnifiedTest(PseudoHuber)) { } TEST(Objective, DeclareUnifiedTest(LogisticRegressionGPair)) { - Context ctx = MakeCUDACtx(GPUIDX); + Context ctx = MakeCUDACtx(GetGPUId()); std::vector> args; std::unique_ptr obj{ObjFunction::Create("reg:logistic", &ctx)}; @@ -107,7 +107,7 @@ TEST(Objective, DeclareUnifiedTest(LogisticRegressionGPair)) { } 
TEST(Objective, DeclareUnifiedTest(LogisticRegressionBasic)) { - Context ctx = MakeCUDACtx(GPUIDX); + Context ctx = MakeCUDACtx(GetGPUId()); std::vector> args; std::unique_ptr obj{ObjFunction::Create("reg:logistic", &ctx)}; @@ -136,7 +136,7 @@ TEST(Objective, DeclareUnifiedTest(LogisticRegressionBasic)) { } TEST(Objective, DeclareUnifiedTest(LogisticRawGPair)) { - Context ctx = MakeCUDACtx(GPUIDX); + Context ctx = MakeCUDACtx(GetGPUId()); std::vector> args; std::unique_ptr obj { ObjFunction::Create("binary:logitraw", &ctx) @@ -152,7 +152,7 @@ TEST(Objective, DeclareUnifiedTest(LogisticRawGPair)) { } TEST(Objective, DeclareUnifiedTest(PoissonRegressionGPair)) { - Context ctx = MakeCUDACtx(GPUIDX); + Context ctx = MakeCUDACtx(GetGPUId()); std::vector> args; std::unique_ptr obj { ObjFunction::Create("count:poisson", &ctx) @@ -176,7 +176,7 @@ TEST(Objective, DeclareUnifiedTest(PoissonRegressionGPair)) { } TEST(Objective, DeclareUnifiedTest(PoissonRegressionBasic)) { - Context ctx = MakeCUDACtx(GPUIDX); + Context ctx = MakeCUDACtx(GetGPUId()); std::vector> args; std::unique_ptr obj { ObjFunction::Create("count:poisson", &ctx) @@ -205,7 +205,7 @@ TEST(Objective, DeclareUnifiedTest(PoissonRegressionBasic)) { } TEST(Objective, DeclareUnifiedTest(GammaRegressionGPair)) { - Context ctx = MakeCUDACtx(GPUIDX); + Context ctx = MakeCUDACtx(GetGPUId()); std::vector> args; std::unique_ptr obj { ObjFunction::Create("reg:gamma", &ctx) @@ -227,7 +227,7 @@ TEST(Objective, DeclareUnifiedTest(GammaRegressionGPair)) { } TEST(Objective, DeclareUnifiedTest(GammaRegressionBasic)) { - Context ctx = MakeCUDACtx(GPUIDX); + Context ctx = MakeCUDACtx(GetGPUId()); std::vector> args; std::unique_ptr obj{ObjFunction::Create("reg:gamma", &ctx)}; @@ -256,7 +256,7 @@ TEST(Objective, DeclareUnifiedTest(GammaRegressionBasic)) { } TEST(Objective, DeclareUnifiedTest(TweedieRegressionGPair)) { - Context ctx = MakeCUDACtx(GPUIDX); + Context ctx = MakeCUDACtx(GetGPUId()); std::vector> args; std::unique_ptr obj{ObjFunction::Create("reg:tweedie", &ctx)}; @@ -280,7 +280,7 @@ TEST(Objective, DeclareUnifiedTest(TweedieRegressionGPair)) { #if defined(__CUDACC__) TEST(Objective, CPU_vs_CUDA) { - Context ctx = MakeCUDACtx(GPUIDX); + Context ctx = MakeCUDACtx(GetGPUId()); ObjFunction* obj = ObjFunction::Create("reg:squarederror", &ctx); HostDeviceVector cpu_out_preds; @@ -331,7 +331,7 @@ TEST(Objective, CPU_vs_CUDA) { #endif TEST(Objective, DeclareUnifiedTest(TweedieRegressionBasic)) { - Context ctx = MakeCUDACtx(GPUIDX); + Context ctx = MakeCUDACtx(GetGPUId()); std::vector> args; std::unique_ptr obj{ObjFunction::Create("reg:tweedie", &ctx)}; @@ -360,7 +360,7 @@ TEST(Objective, DeclareUnifiedTest(TweedieRegressionBasic)) { // CoxRegression not implemented in GPU code, no need for testing. 
#if !defined(__CUDACC__) TEST(Objective, CoxRegressionGPair) { - Context ctx = MakeCUDACtx(GPUIDX); + Context ctx = MakeCUDACtx(GetGPUId()); std::vector> args; std::unique_ptr obj{ObjFunction::Create("survival:cox", &ctx)}; @@ -375,7 +375,7 @@ TEST(Objective, CoxRegressionGPair) { #endif TEST(Objective, DeclareUnifiedTest(AbsoluteError)) { - Context ctx = MakeCUDACtx(GPUIDX); + Context ctx = MakeCUDACtx(GetGPUId()); std::unique_ptr obj{ObjFunction::Create("reg:absoluteerror", &ctx)}; obj->Configure({}); CheckConfigReload(obj, "reg:absoluteerror"); @@ -419,7 +419,7 @@ TEST(Objective, DeclareUnifiedTest(AbsoluteError)) { } TEST(Objective, DeclareUnifiedTest(AbsoluteErrorLeaf)) { - Context ctx = MakeCUDACtx(GPUIDX); + Context ctx = MakeCUDACtx(GetGPUId()); bst_target_t constexpr kTargets = 3, kRows = 16; std::unique_ptr obj{ObjFunction::Create("reg:absoluteerror", &ctx)}; obj->Configure({}); diff --git a/tests/cpp/plugin/test_example_objective.cc b/tests/cpp/plugin/test_example_objective.cc index ccb83c781..29fe2ad2b 100644 --- a/tests/cpp/plugin/test_example_objective.cc +++ b/tests/cpp/plugin/test_example_objective.cc @@ -5,7 +5,7 @@ namespace xgboost { TEST(Plugin, ExampleObjective) { - xgboost::Context ctx = MakeCUDACtx(GPUIDX); + xgboost::Context ctx = MakeCUDACtx(GetGPUId()); auto* obj = xgboost::ObjFunction::Create("mylogistic", &ctx); ASSERT_EQ(obj->DefaultEvalMetric(), std::string{"logloss"}); delete obj; diff --git a/tests/cpp/plugin/test_federated_adapter.cu b/tests/cpp/plugin/test_federated_adapter.cu index 134446f11..75422fcca 100644 --- a/tests/cpp/plugin/test_federated_adapter.cu +++ b/tests/cpp/plugin/test_federated_adapter.cu @@ -12,6 +12,7 @@ #include "../../../src/collective/communicator-inl.cuh" #include "../../../src/collective/device_communicator_adapter.cuh" #include "./helpers.h" +#include "../helpers.h" namespace xgboost::collective { @@ -26,10 +27,12 @@ namespace { void VerifyAllReduceSum() { auto const world_size = collective::GetWorldSize(); auto const rank = collective::GetRank(); + auto const device = GetGPUId(); int count = 3; + common::SetDevice(device); thrust::device_vector buffer(count, 0); thrust::sequence(buffer.begin(), buffer.end()); - collective::AllReduce(rank, buffer.data().get(), count); + collective::AllReduce(device, buffer.data().get(), count); thrust::host_vector host_buffer = buffer; EXPECT_EQ(host_buffer.size(), count); for (auto i = 0; i < count; i++) { @@ -39,10 +42,6 @@ void VerifyAllReduceSum() { } // anonymous namespace TEST_F(FederatedAdapterTest, MGPUAllReduceSum) { - auto const n_gpus = common::AllVisibleGPUs(); - if (n_gpus <= 1) { - GTEST_SKIP() << "Skipping MGPUAllReduceSum test with # GPUs = " << n_gpus; - } RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyAllReduceSum); } @@ -50,13 +49,15 @@ namespace { void VerifyAllGatherV() { auto const world_size = collective::GetWorldSize(); auto const rank = collective::GetRank(); + auto const device = GetGPUId(); int const count = rank + 2; + common::SetDevice(device); thrust::device_vector buffer(count, 0); thrust::sequence(buffer.begin(), buffer.end()); std::vector segments(world_size); dh::caching_device_vector receive_buffer{}; - collective::AllGatherV(rank, buffer.data().get(), count, &segments, &receive_buffer); + collective::AllGatherV(device, buffer.data().get(), count, &segments, &receive_buffer); EXPECT_EQ(segments[0], 2); EXPECT_EQ(segments[1], 3); @@ -70,11 +71,6 @@ void VerifyAllGatherV() { } // anonymous namespace TEST_F(FederatedAdapterTest, MGPUAllGatherV) 
{ - auto const n_gpus = common::AllVisibleGPUs(); - if (n_gpus <= 1) { - GTEST_SKIP() << "Skipping MGPUAllGatherV test with # GPUs = " << n_gpus; - } RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyAllGatherV); } - } // namespace xgboost::collective diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu index be0cad5ce..ecddf2288 100644 --- a/tests/cpp/predictor/test_gpu_predictor.cu +++ b/tests/cpp/predictor/test_gpu_predictor.cu @@ -60,7 +60,7 @@ void VerifyBasicColumnSplit(std::array, 32> const& expected_r auto const world_size = collective::GetWorldSize(); auto const rank = collective::GetRank(); - auto ctx = MakeCUDACtx(rank); + auto ctx = MakeCUDACtx(GetGPUId()); std::unique_ptr predictor = std::unique_ptr(Predictor::Create("gpu_predictor", &ctx)); predictor->Configure({}); @@ -85,12 +85,9 @@ void VerifyBasicColumnSplit(std::array, 32> const& expected_r } } // anonymous namespace -TEST(GPUPredictor, MGPUBasicColumnSplit) { - auto const n_gpus = common::AllVisibleGPUs(); - if (n_gpus <= 1) { - GTEST_SKIP() << "Skipping MGPUIBasicColumnSplit test with # GPUs = " << n_gpus; - } +class MGPUPredictorTest : public BaseMGPUTest {}; +TEST_F(MGPUPredictorTest, BasicColumnSplit) { auto ctx = MakeCUDACtx(0); std::unique_ptr predictor = std::unique_ptr(Predictor::Create("gpu_predictor", &ctx)); @@ -114,7 +111,7 @@ TEST(GPUPredictor, MGPUBasicColumnSplit) { result[i - 1] = out_predictions_h; } - RunWithInMemoryCommunicator(n_gpus, VerifyBasicColumnSplit, result); + DoTest(VerifyBasicColumnSplit, result); } TEST(GPUPredictor, EllpackBasic) { @@ -286,7 +283,7 @@ TEST(GPUPredictor, CategoricalPredictLeaf) { TEST(GPUPredictor, PredictLeafBasic) { size_t constexpr kRows = 5, kCols = 5; auto dmat = RandomDataGenerator(kRows, kCols, 0).Device(0).GenerateDMatrix(); - auto lparam = MakeCUDACtx(GPUIDX); + auto lparam = MakeCUDACtx(GetGPUId()); std::unique_ptr gpu_predictor = std::unique_ptr(Predictor::Create("gpu_predictor", &lparam)); gpu_predictor->Configure({}); From e93a2748230656de9770aa880157dc71c35af2da Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 2 Aug 2023 18:28:26 +0800 Subject: [PATCH 059/136] Small cleanup for histogram routines. (#9427) * Small cleanup for histogram routines. - Extract hist train param from GPU hist. - Make histogram const after construction. - Unify parameter names. 
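Since the flag now lives in a single shared `HistMakerTrainParam`, the same
`debug_synchronize` switch reaches the exact hist, approx, and GPU hist
updaters alike. A minimal sketch of exercising it from Python (illustrative
only; `X`/`y` are stand-in data, and the tree-identity check is a no-op
outside distributed training):

    import numpy as np
    import xgboost as xgb

    X, y = np.random.randn(256, 8), np.random.randn(256)
    dtrain = xgb.DMatrix(X, label=y)
    # After each update, workers verify their tree matches rank 0's.
    xgb.train(
        {"tree_method": "hist", "debug_synchronize": True},
        dtrain,
        num_boost_round=4,
    )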
--- R-package/src/Makevars.in | 1 + R-package/src/Makevars.win | 1 + include/xgboost/linalg.h | 4 +- src/common/hist_util.cc | 41 ++++++------ src/common/hist_util.h | 3 +- src/common/threading_utils.h | 9 ++- src/tree/hist/evaluate_splits.h | 16 ++--- src/tree/hist/param.cc | 34 ++++++++++ src/tree/hist/param.h | 20 ++++++ src/tree/updater_approx.cc | 29 ++++++--- src/tree/updater_gpu_hist.cu | 46 +++---------- src/tree/updater_quantile_hist.cc | 65 ++++++++++++------- tests/cpp/tree/test_gpu_hist.cu | 13 ++-- tests/cpp/tree/test_histmaker.cc | 4 ++ tests/cpp/tree/test_prediction_cache.cc | 1 + tests/cpp/tree/test_quantile_hist.cc | 5 +- .../test_with_dask/test_with_dask.py | 1 + 17 files changed, 182 insertions(+), 111 deletions(-) create mode 100644 src/tree/hist/param.cc create mode 100644 src/tree/hist/param.h diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in index f199544a3..f03bbc73f 100644 --- a/R-package/src/Makevars.in +++ b/R-package/src/Makevars.in @@ -68,6 +68,7 @@ OBJECTS= \ $(PKGROOT)/src/tree/updater_quantile_hist.o \ $(PKGROOT)/src/tree/updater_refresh.o \ $(PKGROOT)/src/tree/updater_sync.o \ + $(PKGROOT)/src/tree/hist/param.o \ $(PKGROOT)/src/linear/linear_updater.o \ $(PKGROOT)/src/linear/updater_coordinate.o \ $(PKGROOT)/src/linear/updater_shotgun.o \ diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win index 2e7f98113..9f4d0d5f3 100644 --- a/R-package/src/Makevars.win +++ b/R-package/src/Makevars.win @@ -68,6 +68,7 @@ OBJECTS= \ $(PKGROOT)/src/tree/updater_quantile_hist.o \ $(PKGROOT)/src/tree/updater_refresh.o \ $(PKGROOT)/src/tree/updater_sync.o \ + $(PKGROOT)/src/tree/hist/param.o \ $(PKGROOT)/src/linear/linear_updater.o \ $(PKGROOT)/src/linear/updater_coordinate.o \ $(PKGROOT)/src/linear/updater_shotgun.o \ diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index 4ca5b9f7e..6d2b54f84 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -574,7 +574,9 @@ template ::value && !std::is_pointer_v> * = nullptr> auto MakeTensorView(Context const *ctx, Container &data, S &&...shape) { // NOLINT - using T = typename Container::value_type; + using T = std::conditional_t, + std::add_const_t, + typename Container::value_type>; std::size_t in_shape[sizeof...(S)]; detail::IndexToArr(in_shape, std::forward(shape)...); return TensorView{data, in_shape, ctx->gpu_id}; diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index 489ef2396..e52ce1f66 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -81,11 +81,11 @@ void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end) { /*! 
* \brief Increment hist as dst += add in range [begin, end) */ -void IncrementHist(GHistRow dst, const GHistRow add, size_t begin, size_t end) { - double* pdst = reinterpret_cast(dst.data()); +void IncrementHist(GHistRow dst, ConstGHistRow add, std::size_t begin, std::size_t end) { + double *pdst = reinterpret_cast(dst.data()); const double *padd = reinterpret_cast(add.data()); - for (size_t i = 2 * begin; i < 2 * end; ++i) { + for (std::size_t i = 2 * begin; i < 2 * end; ++i) { pdst[i] += padd[i]; } } @@ -207,18 +207,23 @@ void RowsWiseBuildHistKernel(Span gpair, const size_t size = row_indices.Size(); const size_t *rid = row_indices.begin; - auto const *pgh = reinterpret_cast(gpair.data()); + auto const *p_gpair = reinterpret_cast(gpair.data()); const BinIdxType *gradient_index = gmat.index.data(); auto const &row_ptr = gmat.row_ptr.data(); auto base_rowid = gmat.base_rowid; - const uint32_t *offsets = gmat.index.Offset(); - auto get_row_ptr = [&](size_t ridx) { + uint32_t const *offsets = gmat.index.Offset(); + // There's no feature-based compression if missing value is present. + if (kAnyMissing) { + CHECK(!offsets); + } else { + CHECK(offsets); + } + + auto get_row_ptr = [&](bst_row_t ridx) { return kFirstPage ? row_ptr[ridx] : row_ptr[ridx - base_rowid]; }; - auto get_rid = [&](size_t ridx) { - return kFirstPage ? ridx : (ridx - base_rowid); - }; + auto get_rid = [&](bst_row_t ridx) { return kFirstPage ? ridx : (ridx - base_rowid); }; const size_t n_features = get_row_ptr(row_indices.begin[0] + 1) - get_row_ptr(row_indices.begin[0]); @@ -228,7 +233,7 @@ void RowsWiseBuildHistKernel(Span gpair, // So we need to multiply each row-index/bin-index by 2 // to work with gradient pairs as a singe row FP array - for (size_t i = 0; i < size; ++i) { + for (std::size_t i = 0; i < size; ++i) { const size_t icol_start = kAnyMissing ? get_row_ptr(rid[i]) : get_rid(rid[i]) * n_features; const size_t icol_end = @@ -246,7 +251,7 @@ void RowsWiseBuildHistKernel(Span gpair, kAnyMissing ? get_row_ptr(rid[i + Prefetch::kPrefetchOffset] + 1) : icol_start_prefetch + n_features; - PREFETCH_READ_T0(pgh + two * rid[i + Prefetch::kPrefetchOffset]); + PREFETCH_READ_T0(p_gpair + two * rid[i + Prefetch::kPrefetchOffset]); for (size_t j = icol_start_prefetch; j < icol_end_prefetch; j += Prefetch::GetPrefetchStep()) { PREFETCH_READ_T0(gradient_index + j); @@ -255,12 +260,12 @@ void RowsWiseBuildHistKernel(Span gpair, const BinIdxType *gr_index_local = gradient_index + icol_start; // The trick with pgh_t buffer helps the compiler to generate faster binary. - const float pgh_t[] = {pgh[idx_gh], pgh[idx_gh + 1]}; + const float pgh_t[] = {p_gpair[idx_gh], p_gpair[idx_gh + 1]}; for (size_t j = 0; j < row_size; ++j) { - const uint32_t idx_bin = two * (static_cast(gr_index_local[j]) + - (kAnyMissing ? 0 : offsets[j])); + const uint32_t idx_bin = + two * (static_cast(gr_index_local[j]) + (kAnyMissing ? 0 : offsets[j])); auto hist_local = hist_data + idx_bin; - *(hist_local) += pgh_t[0]; + *(hist_local) += pgh_t[0]; *(hist_local + 1) += pgh_t[1]; } } @@ -281,12 +286,10 @@ void ColsWiseBuildHistKernel(Span gpair, auto const &row_ptr = gmat.row_ptr.data(); auto base_rowid = gmat.base_rowid; const uint32_t *offsets = gmat.index.Offset(); - auto get_row_ptr = [&](size_t ridx) { + auto get_row_ptr = [&](bst_row_t ridx) { return kFirstPage ? row_ptr[ridx] : row_ptr[ridx - base_rowid]; }; - auto get_rid = [&](size_t ridx) { - return kFirstPage ? 
ridx : (ridx - base_rowid); - }; + auto get_rid = [&](bst_row_t ridx) { return kFirstPage ? ridx : (ridx - base_rowid); }; const size_t n_features = gmat.cut.Ptrs().size() - 1; const size_t n_columns = n_features; diff --git a/src/common/hist_util.h b/src/common/hist_util.h index fd364b8ac..12db898a9 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -362,6 +362,7 @@ bst_bin_t XGBOOST_HOST_DEV_INLINE BinarySearchBin(std::size_t begin, std::size_t } using GHistRow = Span; +using ConstGHistRow = Span; /*! * \brief fill a histogram by zeros @@ -371,7 +372,7 @@ void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end); /*! * \brief Increment hist as dst += add in range [begin, end) */ -void IncrementHist(GHistRow dst, const GHistRow add, size_t begin, size_t end); +void IncrementHist(GHistRow dst, ConstGHistRow add, std::size_t begin, std::size_t end); /*! * \brief Copy hist from src to dst in range [begin, end) diff --git a/src/common/threading_utils.h b/src/common/threading_utils.h index 0247e4dcc..9c7483847 100644 --- a/src/common/threading_utils.h +++ b/src/common/threading_utils.h @@ -136,7 +136,7 @@ class BlockedSpace2d { // Wrapper to implement nested parallelism with simple omp parallel for template void ParallelFor2d(const BlockedSpace2d& space, int nthreads, Func func) { - const size_t num_blocks_in_space = space.Size(); + std::size_t n_blocks_in_space = space.Size(); CHECK_GE(nthreads, 1); dmlc::OMPException exc; @@ -144,11 +144,10 @@ void ParallelFor2d(const BlockedSpace2d& space, int nthreads, Func func) { { exc.Run([&]() { size_t tid = omp_get_thread_num(); - size_t chunck_size = - num_blocks_in_space / nthreads + !!(num_blocks_in_space % nthreads); + size_t chunck_size = n_blocks_in_space / nthreads + !!(n_blocks_in_space % nthreads); - size_t begin = chunck_size * tid; - size_t end = std::min(begin + chunck_size, num_blocks_in_space); + std::size_t begin = chunck_size * tid; + std::size_t end = std::min(begin + chunck_size, n_blocks_in_space); for (auto i = begin; i < end; i++) { func(space.GetFirstDimension(i), space.GetRange(i)); } diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index 4fb857e06..f4e44fa52 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -65,7 +65,7 @@ class HistEvaluator { * pseudo-category for missing value but here we just do a complete scan to avoid * making specialized histogram bin. */ - void EnumerateOneHot(common::HistogramCuts const &cut, const common::GHistRow &hist, + void EnumerateOneHot(common::HistogramCuts const &cut, common::ConstGHistRow hist, bst_feature_t fidx, bst_node_t nidx, TreeEvaluator::SplitEvaluator const &evaluator, SplitEntry *p_best) const { @@ -143,7 +143,7 @@ class HistEvaluator { */ template void EnumeratePart(common::HistogramCuts const &cut, common::Span sorted_idx, - common::GHistRow const &hist, bst_feature_t fidx, bst_node_t nidx, + common::ConstGHistRow hist, bst_feature_t fidx, bst_node_t nidx, TreeEvaluator::SplitEvaluator const &evaluator, SplitEntry *p_best) { static_assert(d_step == +1 || d_step == -1, "Invalid step."); @@ -214,7 +214,7 @@ class HistEvaluator { // Returns the sum of gradients corresponding to the data points that contains // a non-missing value for the particular feature fid. 
template <int d_step> - GradStats EnumerateSplit(common::HistogramCuts const &cut, const common::GHistRow &hist, + GradStats EnumerateSplit(common::HistogramCuts const &cut, common::ConstGHistRow hist, bst_feature_t fidx, bst_node_t nidx, TreeEvaluator::SplitEvaluator const &evaluator, SplitEntry *p_best) const { @@ -454,8 +454,8 @@ class HistEvaluator { right_child); } - auto Evaluator() const { return tree_evaluator_.GetEvaluator(); } - auto const& Stats() const { return snode_; } + [[nodiscard]] auto Evaluator() const { return tree_evaluator_.GetEvaluator(); } + [[nodiscard]] auto const &Stats() const { return snode_; } float InitRoot(GradStats const &root_sum) { snode_.resize(1); @@ -510,7 +510,7 @@ class HistMultiEvaluator { template bool EnumerateSplit(common::HistogramCuts const &cut, bst_feature_t fidx, - common::Span<common::GHistRow const> hist, + common::Span<common::ConstGHistRow> hist, linalg::VectorView<GradientPairPrecise const> parent_sum, double parent_gain, SplitEntryContainer<std::vector<GradientPairPrecise>> *p_best) const { auto const &cut_ptr = cut.Ptrs(); @@ -651,9 +651,9 @@ class HistMultiEvaluator { auto entry = &tloc_candidates[n_threads * nidx_in_set + tidx]; auto best = &entry->split; auto parent_sum = stats_.Slice(entry->nid, linalg::All()); - std::vector<common::GHistRow> node_hist; + std::vector<common::ConstGHistRow> node_hist; for (auto t_hist : hist) { - node_hist.push_back((*t_hist)[entry->nid]); + node_hist.emplace_back((*t_hist)[entry->nid]); } auto features_set = features[nidx_in_set]->ConstHostSpan(); diff --git a/src/tree/hist/param.cc b/src/tree/hist/param.cc new file mode 100644 index 000000000..602566cd3 --- /dev/null +++ b/src/tree/hist/param.cc @@ -0,0 +1,34 @@ +/** + * Copyright 2021-2023, XGBoost Contributors + */ +#include "param.h" + +#include <string> // for string + +#include "../../collective/communicator-inl.h" // for GetRank, Broadcast +#include "xgboost/json.h" // for Object, Json +#include "xgboost/tree_model.h" // for RegTree + +namespace xgboost::tree { +DMLC_REGISTER_PARAMETER(HistMakerTrainParam); + +void HistMakerTrainParam::CheckTreesSynchronized(RegTree const* local_tree) const { + if (!this->debug_synchronize) { + return; + } + + std::string s_model; + Json model{Object{}}; + int rank = collective::GetRank(); + if (rank == 0) { + local_tree->SaveModel(&model); + } + Json::Dump(model, &s_model, std::ios::binary); + collective::Broadcast(&s_model, 0); + + RegTree ref_tree{}; // rank 0 tree + auto j_ref_tree = Json::Load(StringView{s_model}, std::ios::binary); + ref_tree.LoadModel(j_ref_tree); + CHECK(*local_tree == ref_tree); +} +} // namespace xgboost::tree diff --git a/src/tree/hist/param.h b/src/tree/hist/param.h new file mode 100644 index 000000000..3dfbf68e1 --- /dev/null +++ b/src/tree/hist/param.h @@ -0,0 +1,20 @@ +/** + * Copyright 2021-2023, XGBoost Contributors + */ +#pragma once +#include "xgboost/parameter.h" +#include "xgboost/tree_model.h" // for RegTree + +namespace xgboost::tree { +struct HistMakerTrainParam : public XGBoostParameter<HistMakerTrainParam> { + bool debug_synchronize; + void CheckTreesSynchronized(RegTree const* local_tree) const; + + // declare parameters + DMLC_DECLARE_PARAMETER(HistMakerTrainParam) { + DMLC_DECLARE_FIELD(debug_synchronize) + .set_default(false) + .describe("Check if all distributed tree are identical after tree construction."); + } +}; +} // namespace xgboost::tree diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc index 7b5020621..9f496d052 100644 --- a/src/tree/updater_approx.cc +++ b/src/tree/updater_approx.cc @@ -14,13 +14,14 @@ #include "driver.h" #include "hist/evaluate_splits.h" #include "hist/histogram.h" +#include "hist/param.h" #include
"hist/sampler.h" // for SampleGradient -#include "param.h" +#include "param.h" // for HistMakerTrainParam #include "xgboost/base.h" #include "xgboost/data.h" #include "xgboost/json.h" #include "xgboost/linalg.h" -#include "xgboost/task.h" // for ObjInfo +#include "xgboost/task.h" // for ObjInfo #include "xgboost/tree_model.h" #include "xgboost/tree_updater.h" // for TreeUpdater @@ -42,6 +43,7 @@ auto BatchSpec(TrainParam const &p, common::Span hess) { class GloablApproxBuilder { protected: TrainParam const *param_; + HistMakerTrainParam const *hist_param_{nullptr}; std::shared_ptr col_sampler_; HistEvaluator evaluator_; HistogramBuilder histogram_builder_; @@ -168,10 +170,12 @@ class GloablApproxBuilder { } public: - explicit GloablApproxBuilder(TrainParam const *param, MetaInfo const &info, Context const *ctx, + explicit GloablApproxBuilder(TrainParam const *param, HistMakerTrainParam const *hist_param, + MetaInfo const &info, Context const *ctx, std::shared_ptr column_sampler, ObjInfo const *task, common::Monitor *monitor) : param_{param}, + hist_param_{hist_param}, col_sampler_{std::move(column_sampler)}, evaluator_{ctx, param_, info, col_sampler_}, ctx_{ctx}, @@ -259,6 +263,7 @@ class GlobalApproxUpdater : public TreeUpdater { std::shared_ptr column_sampler_ = std::make_shared(); ObjInfo const *task_; + HistMakerTrainParam hist_param_; public: explicit GlobalApproxUpdater(Context const *ctx, ObjInfo const *task) @@ -266,9 +271,15 @@ class GlobalApproxUpdater : public TreeUpdater { monitor_.Init(__func__); } - void Configure(Args const &) override {} - void LoadConfig(Json const &) override {} - void SaveConfig(Json *) const override {} + void Configure(Args const &args) override { hist_param_.UpdateAllowUnknown(args); } + void LoadConfig(Json const &in) override { + auto const &config = get(in); + FromJson(config.at("hist_train_param"), &hist_param_); + } + void SaveConfig(Json *p_out) const override { + auto &out = *p_out; + out["hist_train_param"] = ToJson(hist_param_); + } void InitData(TrainParam const ¶m, HostDeviceVector const *gpair, linalg::Matrix *sampled) { @@ -283,8 +294,9 @@ class GlobalApproxUpdater : public TreeUpdater { void Update(TrainParam const *param, HostDeviceVector *gpair, DMatrix *m, common::Span> out_position, const std::vector &trees) override { - pimpl_ = std::make_unique(param, m->Info(), ctx_, column_sampler_, task_, - &monitor_); + CHECK(hist_param_.GetInitialised()); + pimpl_ = std::make_unique(param, &hist_param_, m->Info(), ctx_, + column_sampler_, task_, &monitor_); linalg::Matrix h_gpair; // Obtain the hessian values for weighted sketching @@ -299,6 +311,7 @@ class GlobalApproxUpdater : public TreeUpdater { std::size_t t_idx = 0; for (auto p_tree : trees) { this->pimpl_->UpdateTree(m, s_gpair, hess, p_tree, &out_position[t_idx]); + hist_param_.CheckTreesSynchronized(p_tree); ++t_idx; } } diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 56d7d2a89..0403c7881 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -30,6 +30,7 @@ #include "gpu_hist/gradient_based_sampler.cuh" #include "gpu_hist/histogram.cuh" #include "gpu_hist/row_partitioner.cuh" +#include "hist/param.h" #include "param.h" #include "updater_gpu_common.cuh" #include "xgboost/base.h" @@ -47,37 +48,6 @@ namespace xgboost::tree { DMLC_REGISTRY_FILE_TAG(updater_gpu_hist); #endif // !defined(GTEST_TEST) -// training parameters specific to this algorithm -struct GPUHistMakerTrainParam : public XGBoostParameter { - bool debug_synchronize; - 
// declare parameters - DMLC_DECLARE_PARAMETER(GPUHistMakerTrainParam) { - DMLC_DECLARE_FIELD(debug_synchronize) - .set_default(false) - .describe("Check if all distributed tree are identical after tree construction."); - } - - // Only call this method for testing - void CheckTreesSynchronized(RegTree const* local_tree) const { - if (this->debug_synchronize) { - std::string s_model; - common::MemoryBufferStream fs(&s_model); - int rank = collective::GetRank(); - if (rank == 0) { - local_tree->Save(&fs); - } - fs.Seek(0); - collective::Broadcast(&s_model, 0); - RegTree reference_tree{}; // rank 0 tree - reference_tree.Load(&fs); - CHECK(*local_tree == reference_tree); - } - } -}; -#if !defined(GTEST_TEST) -DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam); -#endif // !defined(GTEST_TEST) - /** * \struct DeviceHistogramStorage * @@ -777,12 +747,12 @@ class GPUHistMaker : public TreeUpdater { void LoadConfig(Json const& in) override { auto const& config = get(in); - FromJson(config.at("gpu_hist_train_param"), &this->hist_maker_param_); + FromJson(config.at("hist_train_param"), &this->hist_maker_param_); initialised_ = false; } void SaveConfig(Json* p_out) const override { auto& out = *p_out; - out["gpu_hist_train_param"] = ToJson(hist_maker_param_); + out["hist_train_param"] = ToJson(hist_maker_param_); } ~GPUHistMaker() { // NOLINT @@ -836,6 +806,7 @@ class GPUHistMaker : public TreeUpdater { monitor_.Stop("InitDataOnce"); } p_last_tree_ = p_tree; + CHECK(hist_maker_param_.GetInitialised()); } void UpdateTree(TrainParam const* param, HostDeviceVector* gpair, DMatrix* p_fmat, @@ -869,7 +840,7 @@ class GPUHistMaker : public TreeUpdater { private: bool initialised_{false}; - GPUHistMakerTrainParam hist_maker_param_; + HistMakerTrainParam hist_maker_param_; DMatrix* p_last_fmat_{nullptr}; RegTree const* p_last_tree_{nullptr}; @@ -903,12 +874,12 @@ class GPUGlobalApproxMaker : public TreeUpdater { void LoadConfig(Json const& in) override { auto const& config = get(in); - FromJson(config.at("approx_train_param"), &this->hist_maker_param_); + FromJson(config.at("hist_train_param"), &this->hist_maker_param_); initialised_ = false; } void SaveConfig(Json* p_out) const override { auto& out = *p_out; - out["approx_train_param"] = ToJson(hist_maker_param_); + out["hist_train_param"] = ToJson(hist_maker_param_); } ~GPUGlobalApproxMaker() override { dh::GlobalMemoryLogger().Log(); } @@ -965,6 +936,7 @@ class GPUGlobalApproxMaker : public TreeUpdater { void InitData(DMatrix* p_fmat, RegTree const* p_tree) { this->InitDataOnce(p_fmat); p_last_tree_ = p_tree; + CHECK(hist_maker_param_.GetInitialised()); } void UpdateTree(HostDeviceVector* gpair, DMatrix* p_fmat, RegTree* p_tree, @@ -994,7 +966,7 @@ class GPUGlobalApproxMaker : public TreeUpdater { private: bool initialised_{false}; - GPUHistMakerTrainParam hist_maker_param_; + HistMakerTrainParam hist_maker_param_; dh::device_vector hess_; std::shared_ptr column_sampler_; std::unique_ptr maker_; diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 22b715dc7..63aaf27f6 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -4,18 +4,17 @@ * \brief use quantized feature values to construct a tree * \author Philip Cho, Tianqi Checn, Egor Smirnov */ -#include // for max, copy, transform -#include // for size_t -#include // for uint32_t, int32_t -#include // for unique_ptr, allocator, make_unique, shared_ptr -#include // for accumulate -#include // for basic_ostream, char_traits, operator<< 
-#include // for move, swap -#include // for vector +#include // for max, copy, transform +#include // for size_t +#include // for uint32_t, int32_t +#include // for unique_ptr, allocator, make_unique, shared_ptr +#include // for accumulate +#include // for basic_ostream, char_traits, operator<< +#include // for move, swap +#include // for vector #include "../collective/aggregator.h" // for GlobalSum #include "../collective/communicator-inl.h" // for Allreduce, IsDistributed -#include "../collective/communicator.h" // for Operation #include "../common/hist_util.h" // for HistogramCuts, HistCollection #include "../common/linalg_op.h" // for begin, cbegin, cend #include "../common/random.h" // for ColumnSampler @@ -24,12 +23,12 @@ #include "../common/transform_iterator.h" // for IndexTransformIter, MakeIndexTransformIter #include "../data/gradient_index.h" // for GHistIndexMatrix #include "common_row_partitioner.h" // for CommonRowPartitioner -#include "dmlc/omp.h" // for omp_get_thread_num #include "dmlc/registry.h" // for DMLC_REGISTRY_FILE_TAG #include "driver.h" // for Driver #include "hist/evaluate_splits.h" // for HistEvaluator, HistMultiEvaluator, UpdatePre... #include "hist/expand_entry.h" // for MultiExpandEntry, CPUExpandEntry #include "hist/histogram.h" // for HistogramBuilder, ConstructHistSpace +#include "hist/param.h" // for HistMakerTrainParam #include "hist/sampler.h" // for SampleGradient #include "param.h" // for TrainParam, SplitEntryContainer, GradStats #include "xgboost/base.h" // for GradientPairInternal, GradientPair, bst_targ... @@ -117,6 +116,7 @@ class MultiTargetHistBuilder { private: common::Monitor *monitor_{nullptr}; TrainParam const *param_{nullptr}; + HistMakerTrainParam const *hist_param_{nullptr}; std::shared_ptr col_sampler_; std::unique_ptr evaluator_; // Histogram builder for each target. @@ -306,10 +306,12 @@ class MultiTargetHistBuilder { public: explicit MultiTargetHistBuilder(Context const *ctx, MetaInfo const &info, TrainParam const *param, + HistMakerTrainParam const *hist_param, std::shared_ptr column_sampler, ObjInfo const *task, common::Monitor *monitor) : monitor_{monitor}, param_{param}, + hist_param_{hist_param}, col_sampler_{std::move(column_sampler)}, evaluator_{std::make_unique(ctx, info, param, col_sampler_)}, ctx_{ctx}, @@ -331,10 +333,14 @@ class MultiTargetHistBuilder { } }; -class HistBuilder { +/** + * @brief Tree updater for single-target trees. 
+ */ +class HistUpdater { private: common::Monitor *monitor_; TrainParam const *param_; + HistMakerTrainParam const *hist_param_{nullptr}; std::shared_ptr col_sampler_; std::unique_ptr evaluator_; std::vector partitioner_; @@ -349,14 +355,14 @@ class HistBuilder { Context const *ctx_{nullptr}; public: - explicit HistBuilder(Context const *ctx, std::shared_ptr column_sampler, - TrainParam const *param, DMatrix const *fmat, ObjInfo const *task, - common::Monitor *monitor) + explicit HistUpdater(Context const *ctx, std::shared_ptr column_sampler, + TrainParam const *param, HistMakerTrainParam const *hist_param, + DMatrix const *fmat, ObjInfo const *task, common::Monitor *monitor) : monitor_{monitor}, param_{param}, + hist_param_{hist_param}, col_sampler_{std::move(column_sampler)}, - evaluator_{std::make_unique(ctx, param, fmat->Info(), - col_sampler_)}, + evaluator_{std::make_unique(ctx, param, fmat->Info(), col_sampler_)}, p_last_fmat_(fmat), histogram_builder_{new HistogramBuilder}, task_{task}, @@ -529,7 +535,7 @@ class HistBuilder { std::vector *p_out_position) { monitor_->Start(__func__); if (!task_->UpdateTreeLeaf()) { - monitor_->Stop(__func__); + monitor_->Stop(__func__); return; } for (auto const &part : partitioner_) { @@ -541,20 +547,27 @@ class HistBuilder { /*! \brief construct a tree using quantized feature values */ class QuantileHistMaker : public TreeUpdater { - std::unique_ptr p_impl_{nullptr}; + std::unique_ptr p_impl_{nullptr}; std::unique_ptr p_mtimpl_{nullptr}; std::shared_ptr column_sampler_ = std::make_shared(); common::Monitor monitor_; ObjInfo const *task_{nullptr}; + HistMakerTrainParam hist_param_; public: explicit QuantileHistMaker(Context const *ctx, ObjInfo const *task) : TreeUpdater{ctx}, task_{task} {} - void Configure(const Args &) override {} - void LoadConfig(Json const &) override {} - void SaveConfig(Json *) const override {} + void Configure(Args const &args) override { hist_param_.UpdateAllowUnknown(args); } + void LoadConfig(Json const &in) override { + auto const &config = get(in); + FromJson(config.at("hist_train_param"), &hist_param_); + } + void SaveConfig(Json *p_out) const override { + auto &out = *p_out; + out["hist_train_param"] = ToJson(hist_param_); + } [[nodiscard]] char const *Name() const override { return "grow_quantile_histmaker"; } @@ -562,15 +575,17 @@ class QuantileHistMaker : public TreeUpdater { common::Span> out_position, const std::vector &trees) override { if (trees.front()->IsMultiTarget()) { + CHECK(hist_param_.GetInitialised()); CHECK(param->monotone_constraints.empty()) << "monotone constraint" << MTNotImplemented(); if (!p_mtimpl_) { this->p_mtimpl_ = std::make_unique( - ctx_, p_fmat->Info(), param, column_sampler_, task_, &monitor_); + ctx_, p_fmat->Info(), param, &hist_param_, column_sampler_, task_, &monitor_); } } else { + CHECK(hist_param_.GetInitialised()); if (!p_impl_) { - p_impl_ = - std::make_unique(ctx_, column_sampler_, param, p_fmat, task_, &monitor_); + p_impl_ = std::make_unique(ctx_, column_sampler_, param, &hist_param_, p_fmat, + task_, &monitor_); } } @@ -601,6 +616,8 @@ class QuantileHistMaker : public TreeUpdater { UpdateTree(&monitor_, h_sample_out, p_impl_.get(), p_fmat, param, h_out_position, *tree_it); } + + hist_param_.CheckTreesSynchronized(*tree_it); } } diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index 2bd47d42c..dd2d802ca 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -105,13 +105,13 @@ void TestBuildHist(bool 
use_shared_memory_histograms) { gpair.SetDevice(0); thrust::host_vector h_gidx_buffer (page->gidx_buffer.HostVector()); - maker.row_partitioner.reset(new RowPartitioner(0, kNRows)); + maker.row_partitioner = std::make_unique(0, kNRows); maker.hist.Init(0, page->Cuts().TotalBins()); maker.hist.AllocateHistograms({0}); maker.gpair = gpair.DeviceSpan(); - maker.quantiser.reset(new GradientQuantiser(maker.gpair)); + maker.quantiser = std::make_unique(maker.gpair); maker.page = page.get(); maker.InitFeatureGroupsOnce(); @@ -246,6 +246,7 @@ void UpdateTree(Context const* ctx, HostDeviceVector* gpair, DMatr ObjInfo task{ObjInfo::kRegression}; tree::GPUHistMaker hist_maker{ctx, &task}; + hist_maker.Configure(Args{}); std::vector> position(1); hist_maker.Update(¶m, gpair, dmat, common::Span>{position}, @@ -397,14 +398,14 @@ TEST(GpuHist, ConfigIO) { std::unique_ptr updater{TreeUpdater::Create("grow_gpu_hist", &ctx, &task)}; updater->Configure(Args{}); - Json j_updater { Object() }; + Json j_updater{Object{}}; updater->SaveConfig(&j_updater); - ASSERT_TRUE(IsA(j_updater["gpu_hist_train_param"])); + ASSERT_TRUE(IsA(j_updater["hist_train_param"])); updater->LoadConfig(j_updater); - Json j_updater_roundtrip { Object() }; + Json j_updater_roundtrip{Object{}}; updater->SaveConfig(&j_updater_roundtrip); - ASSERT_TRUE(IsA(j_updater_roundtrip["gpu_hist_train_param"])); + ASSERT_TRUE(IsA(j_updater_roundtrip["hist_train_param"])); ASSERT_EQ(j_updater, j_updater_roundtrip); } diff --git a/tests/cpp/tree/test_histmaker.cc b/tests/cpp/tree/test_histmaker.cc index 8ba53e3f1..d03440339 100644 --- a/tests/cpp/tree/test_histmaker.cc +++ b/tests/cpp/tree/test_histmaker.cc @@ -39,6 +39,7 @@ TEST(GrowHistMaker, InteractionConstraint) { param.UpdateAllowUnknown( Args{{"interaction_constraints", "[[0, 1]]"}, {"num_feature", std::to_string(kCols)}}); std::vector> position(1); + updater->Configure(Args{}); updater->Update(¶m, p_gradients.get(), p_dmat.get(), position, {&tree}); ASSERT_EQ(tree.NumExtraNodes(), 4); @@ -55,6 +56,7 @@ TEST(GrowHistMaker, InteractionConstraint) { std::vector> position(1); TrainParam param; param.Init(Args{}); + updater->Configure(Args{}); updater->Update(¶m, p_gradients.get(), p_dmat.get(), position, {&tree}); ASSERT_EQ(tree.NumExtraNodes(), 10); @@ -81,6 +83,7 @@ void VerifyColumnSplit(int32_t rows, bst_feature_t cols, bool categorical, RegTree tree{1u, cols}; TrainParam param; param.Init(Args{}); + updater->Configure(Args{}); updater->Update(¶m, p_gradients.get(), sliced.get(), position, {&tree}); Json json{Object{}}; @@ -104,6 +107,7 @@ void TestColumnSplit(bool categorical) { std::vector> position(1); TrainParam param; param.Init(Args{}); + updater->Configure(Args{}); updater->Update(¶m, p_gradients.get(), p_dmat.get(), position, {&expected_tree}); } diff --git a/tests/cpp/tree/test_prediction_cache.cc b/tests/cpp/tree/test_prediction_cache.cc index 333f1eccc..0aafb0a4f 100644 --- a/tests/cpp/tree/test_prediction_cache.cc +++ b/tests/cpp/tree/test_prediction_cache.cc @@ -73,6 +73,7 @@ class TestPredictionCache : public ::testing::Test { tree::TrainParam param; param.UpdateAllowUnknown(Args{{"max_bin", "64"}}); + updater->Configure(Args{}); std::vector> position(1); updater->Update(¶m, &gpair, Xy_.get(), position, trees); HostDeviceVector out_prediction_cached; diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc index 7b99a5bf2..4afea74ce 100644 --- a/tests/cpp/tree/test_quantile_hist.cc +++ b/tests/cpp/tree/test_quantile_hist.cc @@ -13,7 +13,6 @@ 
#include "../../../src/tree/common_row_partitioner.h" #include "../../../src/tree/hist/expand_entry.h" // for MultiExpandEntry, CPUExpandEntry #include "../../../src/tree/param.h" -#include "../../../src/tree/split_evaluator.h" #include "../helpers.h" #include "test_partitioner.h" #include "xgboost/data.h" @@ -49,7 +48,7 @@ void TestPartitioner(bst_target_t n_targets) { auto min_value = gmat.cut.MinValues()[split_ind]; RegTree tree{n_targets, n_features}; CommonRowPartitioner partitioner{&ctx, n_samples, base_rowid, false}; - if constexpr (std::is_same::value) { + if constexpr (std::is_same_v) { GetSplit(&tree, min_value, &candidates); } else { GetMultiSplitForTest(&tree, min_value, &candidates); @@ -217,6 +216,7 @@ void VerifyColumnSplit(bst_row_t rows, bst_feature_t cols, bst_target_t n_target RegTree tree{n_targets, cols}; TrainParam param; param.Init(Args{}); + updater->Configure(Args{}); updater->Update(¶m, p_gradients.get(), sliced.get(), position, {&tree}); Json json{Object{}}; @@ -241,6 +241,7 @@ void TestColumnSplit(bst_target_t n_targets) { std::vector> position(1); TrainParam param; param.Init(Args{}); + updater->Configure(Args{}); updater->Update(¶m, p_gradients.get(), Xy.get(), position, {&expected_tree}); } diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py index 66c6058a5..23bfc2d23 100644 --- a/tests/test_distributed/test_with_dask/test_with_dask.py +++ b/tests/test_distributed/test_with_dask/test_with_dask.py @@ -1459,6 +1459,7 @@ class TestWithDask: tree_method: str, ) -> None: params["tree_method"] = tree_method + params["debug_synchronize"] = True params = dataset.set_params(params) # It doesn't make sense to distribute a completely # empty dataset. From 71299888473b59310b3a14e1080364472438573e Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 3 Aug 2023 12:44:16 +0800 Subject: [PATCH 060/136] Accept only keyword arguments in data iterator. (#9431) --- python-package/xgboost/core.py | 5 ++++- python-package/xgboost/testing/__init__.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 5658a5079..14a96f117 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -582,8 +582,8 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes @require_keyword_args(True) def input_data( - data: Any, *, + data: Any, feature_names: Optional[FeatureNames] = None, feature_types: Optional[FeatureTypes] = None, **kwargs: Any, @@ -684,6 +684,9 @@ def require_keyword_args( @wraps(func) def inner_f(*args: Any, **kwargs: Any) -> _T: extra_args = len(args) - len(all_args) + if not all_args and extra_args > 0: # keyword argument only + raise TypeError("Keyword argument is required.") + if extra_args > 0: # ignore first 'self' argument for instance methods args_msg = [ diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 6445f1c94..48809b46f 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -212,7 +212,7 @@ class IteratorForTest(xgb.core.DataIter): if self.it == len(self.X): return 0 - with pytest.raises(TypeError, match="keyword args"): + with pytest.raises(TypeError, match="Keyword argument"): input_data(self.X[self.it], self.y[self.it], None) # Use copy to make sure the iterator doesn't hold a reference to the data. 
From f958e326832c9acc20f4c9548132f036cf1785af Mon Sep 17 00:00:00 2001 From: Hendrik Makait Date: Thu, 3 Aug 2023 14:14:07 +0200 Subject: [PATCH 061/136] Raise if expected workers are not alive in `xgboost.dask.train` (#9421) --- python-package/xgboost/dask.py | 13 ++++++- .../test_with_dask/test_with_dask.py | 38 ++++++++++++++++++- 2 files changed, 48 insertions(+), 3 deletions(-) diff --git a/python-package/xgboost/dask.py b/python-package/xgboost/dask.py index 271a5e458..219ad2698 100644 --- a/python-package/xgboost/dask.py +++ b/python-package/xgboost/dask.py @@ -850,8 +850,6 @@ async def _get_rabit_args( except Exception: # pylint: disable=broad-except sched_addr = None - # make sure all workers are online so that we can obtain reliable scheduler_info - await client.wait_for_workers(n_workers) # type: ignore env = await client.run_on_scheduler( _start_tracker, n_workers, sched_addr, user_addr ) @@ -907,6 +905,16 @@ def _filter_empty( raise ValueError("None of the workers can provide a valid result.") +async def _check_workers_are_alive( + workers: List[str], client: "distributed.Client" +) -> None: + info = await client.scheduler.identity() + current_workers = info["workers"].keys() + missing_workers = set(workers) - current_workers + if missing_workers: + raise RuntimeError(f"Missing required workers: {missing_workers}") + + async def _train_async( client: "distributed.Client", global_config: Dict[str, Any], @@ -924,6 +932,7 @@ async def _train_async( custom_metric: Optional[Metric], ) -> Optional[TrainReturnT]: workers = _get_workers_from_data(dtrain, evals) + await _check_workers_are_alive(workers, client) _rabit_args = await _get_rabit_args(len(workers), dconfig, client) _check_distributed_params(params) diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py index 23bfc2d23..3add01192 100644 --- a/tests/test_distributed/test_with_dask/test_with_dask.py +++ b/tests/test_distributed/test_with_dask/test_with_dask.py @@ -36,7 +36,8 @@ pytestmark = [tm.timeout(60), pytest.mark.skipif(**tm.no_dask())] import dask import dask.array as da import dask.dataframe as dd -from distributed import Client, LocalCluster +from distributed import Client, LocalCluster, Nanny, Worker +from distributed.utils_test import async_poll_for, gen_cluster from toolz import sliding_window # dependency of dask from xgboost.dask import DaskDMatrix @@ -2226,3 +2227,38 @@ class TestDaskCallbacks: ) for i in range(1, 10): assert os.path.exists(os.path.join(tmpdir, "model_" + str(i) + ".json")) + + +@gen_cluster(client=True, clean_kwargs={"processes": False, "threads": False}, allow_unclosed=True) +async def test_worker_left(c, s, a, b): + async with Worker(s.address): + dx = da.random.random((1000, 10)).rechunk(chunks=(10, None)) + dy = da.random.random((1000,)).rechunk(chunks=(10,)) + d_train = await xgb.dask.DaskDMatrix( + c, dx, dy, + ) + await async_poll_for(lambda: len(s.workers) == 2, timeout=5) + with pytest.raises(RuntimeError, match="Missing"): + await xgb.dask.train( + c, + {}, + d_train, + evals=[(d_train, "train")], + ) + + +@gen_cluster(client=True, Worker=Nanny, clean_kwargs={"processes": False, "threads": False}, allow_unclosed=True) +async def test_worker_restarted(c, s, a, b): + dx = da.random.random((1000, 10)).rechunk(chunks=(10, None)) + dy = da.random.random((1000,)).rechunk(chunks=(10,)) + d_train = await xgb.dask.DaskDMatrix( + c, dx, dy, + ) + await c.restart_workers([a.worker_address]) + with pytest.raises(RuntimeError, 
match="Missing"): + await xgb.dask.train( + c, + {}, + d_train, + evals=[(d_train, "train")], + ) From 1332ff787ff620c0d12230b4f3af4d695022e483 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 3 Aug 2023 21:46:36 +0800 Subject: [PATCH 062/136] Unify the code path between local and distributed training. (#9433) This removes the need for a local histogram space during distributed training, which cuts the cache size by half. --- src/common/hist_util.h | 1 + src/tree/hist/histogram.h | 183 +++++--------------------- tests/cpp/tree/hist/test_histogram.cc | 34 ++--- 3 files changed, 45 insertions(+), 173 deletions(-) diff --git a/src/common/hist_util.h b/src/common/hist_util.h index 12db898a9..9bc44409e 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -453,6 +453,7 @@ class HistCollection { data_[0].resize(new_size); } } + [[nodiscard]] bool IsContiguous() const { return contiguous_allocation_; } private: /*! \brief number of all bins over all features */ diff --git a/src/tree/hist/histogram.h b/src/tree/hist/histogram.h index b7f5f5da6..aef7f6df1 100644 --- a/src/tree/hist/histogram.h +++ b/src/tree/hist/histogram.h @@ -14,14 +14,11 @@ #include "expand_entry.h" #include "xgboost/tree_model.h" // for RegTree -namespace xgboost { -namespace tree { +namespace xgboost::tree { template class HistogramBuilder { /*! \brief culmulative histogram of gradients. */ common::HistCollection hist_; - /*! \brief culmulative local parent histogram of gradients. */ - common::HistCollection hist_local_worker_; common::ParallelGHistBuilder buffer_; BatchParam param_; int32_t n_threads_{-1}; @@ -46,12 +43,9 @@ class HistogramBuilder { n_batches_ = n_batches; param_ = p; hist_.Init(total_bins); - hist_local_worker_.Init(total_bins); buffer_.Init(total_bins); is_distributed_ = is_distributed; is_col_split_ = is_col_split; - // Workaround s390x gcc 7.5.0 - auto DMLC_ATTRIBUTE_UNUSED __force_instantiation = &GradientPairPrecise::Reduce; } template @@ -91,17 +85,19 @@ class HistogramBuilder { }); } - void AddHistRows(int *starting_index, int *sync_count, + void AddHistRows(int *starting_index, std::vector const &nodes_for_explicit_hist_build, - std::vector const &nodes_for_subtraction_trick, - RegTree const *p_tree) { - if (is_distributed_ && !is_col_split_) { - this->AddHistRowsDistributed(starting_index, sync_count, nodes_for_explicit_hist_build, - nodes_for_subtraction_trick, p_tree); - } else { - this->AddHistRowsLocal(starting_index, sync_count, nodes_for_explicit_hist_build, - nodes_for_subtraction_trick); + std::vector const &nodes_for_subtraction_trick) { + for (auto const &entry : nodes_for_explicit_hist_build) { + int nid = entry.nid; + this->hist_.AddHistRow(nid); + (*starting_index) = std::min(nid, (*starting_index)); } + + for (auto const &node : nodes_for_subtraction_trick) { + this->hist_.AddHistRow(node.nid); + } + this->hist_.AllocateAllData(); } /** Main entry point of this class, build histogram for tree nodes. 
*/ @@ -111,10 +107,9 @@ class HistogramBuilder { std::vector const &nodes_for_subtraction_trick, common::Span gpair, bool force_read_by_column = false) { int starting_index = std::numeric_limits::max(); - int sync_count = 0; if (page_id == 0) { - this->AddHistRows(&starting_index, &sync_count, nodes_for_explicit_hist_build, - nodes_for_subtraction_trick, p_tree); + this->AddHistRows(&starting_index, nodes_for_explicit_hist_build, + nodes_for_subtraction_trick); } if (gidx.IsDense()) { this->BuildLocalHistograms(page_id, space, gidx, nodes_for_explicit_hist_build, @@ -129,13 +124,8 @@ class HistogramBuilder { return; } - if (is_distributed_ && !is_col_split_) { - this->SyncHistogramDistributed(p_tree, nodes_for_explicit_hist_build, - nodes_for_subtraction_trick, - starting_index, sync_count); - } else { - this->SyncHistogramLocal(p_tree, nodes_for_explicit_hist_build, nodes_for_subtraction_trick); - } + this->SyncHistogram(p_tree, nodes_for_explicit_hist_build, + nodes_for_subtraction_trick, starting_index); } /** same as the other build hist but handles only single batch data (in-core) */ void BuildHist(size_t page_id, GHistIndexMatrix const &gidx, RegTree *p_tree, @@ -156,62 +146,33 @@ class HistogramBuilder { nodes_for_subtraction_trick, gpair, force_read_by_column); } - void SyncHistogramDistributed(RegTree const *p_tree, - std::vector const &nodes_for_explicit_hist_build, - std::vector const &nodes_for_subtraction_trick, - int starting_index, int sync_count) { + void SyncHistogram(RegTree const *p_tree, + std::vector const &nodes_for_explicit_hist_build, + std::vector const &nodes_for_subtraction_trick, + int starting_index) { auto n_bins = buffer_.TotalBins(); common::BlockedSpace2d space( nodes_for_explicit_hist_build.size(), [&](size_t) { return n_bins; }, 1024); - common::ParallelFor2d(space, n_threads_, [&](size_t node, common::Range1d r) { - const auto &entry = nodes_for_explicit_hist_build[node]; - auto this_hist = this->hist_[entry.nid]; - // Merging histograms from each thread into once - buffer_.ReduceHist(node, r.begin(), r.end()); - // Store posible parent node - auto this_local = hist_local_worker_[entry.nid]; - common::CopyHist(this_local, this_hist, r.begin(), r.end()); - - if (!p_tree->IsRoot(entry.nid)) { - const size_t parent_id = p_tree->Parent(entry.nid); - const int subtraction_node_id = nodes_for_subtraction_trick[node].nid; - auto parent_hist = this->hist_local_worker_[parent_id]; - auto sibling_hist = this->hist_[subtraction_node_id]; - common::SubtractionHist(sibling_hist, parent_hist, this_hist, r.begin(), r.end()); - // Store posible parent node - auto sibling_local = hist_local_worker_[subtraction_node_id]; - common::CopyHist(sibling_local, sibling_hist, r.begin(), r.end()); - } - }); - - collective::Allreduce( - reinterpret_cast(this->hist_[starting_index].data()), n_bins * sync_count * 2); - - ParallelSubtractionHist(space, nodes_for_explicit_hist_build, nodes_for_subtraction_trick, - p_tree); - - common::BlockedSpace2d space2( - nodes_for_subtraction_trick.size(), [&](size_t) { return n_bins; }, 1024); - ParallelSubtractionHist(space2, nodes_for_subtraction_trick, nodes_for_explicit_hist_build, - p_tree); - } - - void SyncHistogramLocal(RegTree const *p_tree, - std::vector const &nodes_for_explicit_hist_build, - std::vector const &nodes_for_subtraction_trick) { - const size_t nbins = this->buffer_.TotalBins(); - common::BlockedSpace2d space( - nodes_for_explicit_hist_build.size(), [&](size_t) { return nbins; }, 1024); - + CHECK(hist_.IsContiguous()); 
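    // The first pass below merges every thread's partial histograms into the
    // per-node histogram; after the (distributed-only) allreduce, a second pass
    // fills each sibling via the subtraction trick: sibling = parent - built child.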
    common::ParallelFor2d(space, this->n_threads_, [&](size_t node, common::Range1d r) {
      const auto &entry = nodes_for_explicit_hist_build[node];
      auto this_hist = this->hist_[entry.nid];
      // Merging histograms from each thread into one
      this->buffer_.ReduceHist(node, r.begin(), r.end());
+    });

+    if (is_distributed_ && !is_col_split_) {
+      collective::Allreduce(
+          reinterpret_cast(this->hist_[starting_index].data()),
+          n_bins * nodes_for_explicit_hist_build.size() * 2);
+    }
+
+    common::ParallelFor2d(space, this->n_threads_, [&](std::size_t nidx_in_set, common::Range1d r) {
+      const auto &entry = nodes_for_explicit_hist_build[nidx_in_set];
+      auto this_hist = this->hist_[entry.nid];
       if (!p_tree->IsRoot(entry.nid)) {
         auto const parent_id = p_tree->Parent(entry.nid);
-        auto const subtraction_node_id = nodes_for_subtraction_trick[node].nid;
+        auto const subtraction_node_id = nodes_for_subtraction_trick[nidx_in_set].nid;
         auto parent_hist = this->hist_[parent_id];
         auto sibling_hist = this->hist_[subtraction_node_id];
         common::SubtractionHist(sibling_hist, parent_hist, this_hist, r.begin(), r.end());
@@ -222,82 +183,7 @@

  public:
   /* Getters for tests. */
   common::HistCollection const &Histogram() { return hist_; }
-  auto& Buffer() { return buffer_; }
-
- private:
-  void
-  ParallelSubtractionHist(const common::BlockedSpace2d &space,
-                          const std::vector &nodes,
-                          const std::vector &subtraction_nodes,
-                          const RegTree *p_tree) {
-    common::ParallelFor2d(
-        space, this->n_threads_, [&](size_t node, common::Range1d r) {
-          const auto &entry = nodes[node];
-          if (!(p_tree->IsLeftChild(entry.nid))) {
-            auto this_hist = this->hist_[entry.nid];
-
-            if (!p_tree->IsRoot(entry.nid)) {
-              const int subtraction_node_id = subtraction_nodes[node].nid;
-              auto parent_hist = hist_[(*p_tree)[entry.nid].Parent()];
-              auto sibling_hist = hist_[subtraction_node_id];
-              common::SubtractionHist(this_hist, parent_hist, sibling_hist,
-                                      r.begin(), r.end());
-            }
-          }
-        });
-  }
-
-  // Add a tree node to histogram buffer in local training environment.
- void AddHistRowsLocal( - int *starting_index, int *sync_count, - std::vector const &nodes_for_explicit_hist_build, - std::vector const &nodes_for_subtraction_trick) { - for (auto const &entry : nodes_for_explicit_hist_build) { - int nid = entry.nid; - this->hist_.AddHistRow(nid); - (*starting_index) = std::min(nid, (*starting_index)); - } - (*sync_count) = nodes_for_explicit_hist_build.size(); - - for (auto const &node : nodes_for_subtraction_trick) { - this->hist_.AddHistRow(node.nid); - } - this->hist_.AllocateAllData(); - } - - void AddHistRowsDistributed(int *starting_index, int *sync_count, - std::vector const &nodes_for_explicit_hist_build, - std::vector const &nodes_for_subtraction_trick, - RegTree const *p_tree) { - const size_t explicit_size = nodes_for_explicit_hist_build.size(); - const size_t subtaction_size = nodes_for_subtraction_trick.size(); - std::vector merged_node_ids(explicit_size + subtaction_size); - for (size_t i = 0; i < explicit_size; ++i) { - merged_node_ids[i] = nodes_for_explicit_hist_build[i].nid; - } - for (size_t i = 0; i < subtaction_size; ++i) { - merged_node_ids[explicit_size + i] = nodes_for_subtraction_trick[i].nid; - } - std::sort(merged_node_ids.begin(), merged_node_ids.end()); - int n_left = 0; - for (auto const &nid : merged_node_ids) { - if (p_tree->IsLeftChild(nid)) { - this->hist_.AddHistRow(nid); - (*starting_index) = std::min(nid, (*starting_index)); - n_left++; - this->hist_local_worker_.AddHistRow(nid); - } - } - for (auto const &nid : merged_node_ids) { - if (!(p_tree->IsLeftChild(nid))) { - this->hist_.AddHistRow(nid); - this->hist_local_worker_.AddHistRow(nid); - } - } - this->hist_.AllocateAllData(); - this->hist_local_worker_.AllocateAllData(); - (*sync_count) = std::max(1, n_left); - } + auto &Buffer() { return buffer_; } }; // Construct a work space for building histogram. 
Eventually we should move this @@ -318,6 +204,5 @@ common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners, nodes_to_build.size(), [&](size_t nidx_in_set) { return partition_size[nidx_in_set]; }, 256}; return space; } -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree #endif // XGBOOST_TREE_HIST_HISTOGRAM_H_ diff --git a/tests/cpp/tree/hist/test_histogram.cc b/tests/cpp/tree/hist/test_histogram.cc index 0198c6c80..b43f7e360 100644 --- a/tests/cpp/tree/hist/test_histogram.cc +++ b/tests/cpp/tree/hist/test_histogram.cc @@ -28,7 +28,6 @@ void TestAddHistRows(bool is_distributed) { std::vector nodes_for_explicit_hist_build_; std::vector nodes_for_subtraction_trick_; int starting_index = std::numeric_limits::max(); - int sync_count = 0; size_t constexpr kNRows = 8, kNCols = 16; int32_t constexpr kMaxBins = 4; @@ -49,11 +48,9 @@ void TestAddHistRows(bool is_distributed) { HistogramBuilder histogram_builder; histogram_builder.Reset(gmat.cut.TotalBins(), {kMaxBins, 0.5}, omp_get_max_threads(), 1, is_distributed, false); - histogram_builder.AddHistRows(&starting_index, &sync_count, - nodes_for_explicit_hist_build_, - nodes_for_subtraction_trick_, &tree); + histogram_builder.AddHistRows(&starting_index, nodes_for_explicit_hist_build_, + nodes_for_subtraction_trick_); - ASSERT_EQ(sync_count, 2); ASSERT_EQ(starting_index, 3); for (const CPUExpandEntry &node : nodes_for_explicit_hist_build_) { @@ -78,7 +75,6 @@ void TestSyncHist(bool is_distributed) { std::vector nodes_for_explicit_hist_build_; std::vector nodes_for_subtraction_trick_; int starting_index = std::numeric_limits::max(); - int sync_count = 0; RegTree tree; auto p_fmat = RandomDataGenerator(kNRows, kNCols, 0.8).Seed(3).GenerateDMatrix(); @@ -100,9 +96,8 @@ void TestSyncHist(bool is_distributed) { // level 0 nodes_for_explicit_hist_build_.emplace_back(0, tree.GetDepth(0)); - histogram.AddHistRows(&starting_index, &sync_count, - nodes_for_explicit_hist_build_, - nodes_for_subtraction_trick_, &tree); + histogram.AddHistRows(&starting_index, nodes_for_explicit_hist_build_, + nodes_for_subtraction_trick_); tree.ExpandNode(0, 0, 0, false, 0, 0, 0, 0, 0, 0, 0); nodes_for_explicit_hist_build_.clear(); @@ -112,9 +107,8 @@ void TestSyncHist(bool is_distributed) { nodes_for_explicit_hist_build_.emplace_back(tree[0].LeftChild(), tree.GetDepth(1)); nodes_for_subtraction_trick_.emplace_back(tree[0].RightChild(), tree.GetDepth(2)); - histogram.AddHistRows(&starting_index, &sync_count, - nodes_for_explicit_hist_build_, - nodes_for_subtraction_trick_, &tree); + histogram.AddHistRows(&starting_index, nodes_for_explicit_hist_build_, + nodes_for_subtraction_trick_); tree.ExpandNode(tree[0].LeftChild(), 0, 0, false, 0, 0, 0, 0, 0, 0, 0); tree.ExpandNode(tree[0].RightChild(), 0, 0, false, 0, 0, 0, 0, 0, 0, 0); @@ -127,9 +121,8 @@ void TestSyncHist(bool is_distributed) { nodes_for_explicit_hist_build_.emplace_back(5, tree.GetDepth(5)); nodes_for_subtraction_trick_.emplace_back(6, tree.GetDepth(6)); - histogram.AddHistRows(&starting_index, &sync_count, - nodes_for_explicit_hist_build_, - nodes_for_subtraction_trick_, &tree); + histogram.AddHistRows(&starting_index, nodes_for_explicit_hist_build_, + nodes_for_subtraction_trick_); const size_t n_nodes = nodes_for_explicit_hist_build_.size(); ASSERT_EQ(n_nodes, 2ul); @@ -175,14 +168,8 @@ void TestSyncHist(bool is_distributed) { histogram.Buffer().Reset(1, n_nodes, space, target_hists); // sync hist - if (is_distributed) { - histogram.SyncHistogramDistributed(&tree, 
nodes_for_explicit_hist_build_,
-                                   nodes_for_subtraction_trick_,
-                                   starting_index, sync_count);
-  } else {
-    histogram.SyncHistogramLocal(&tree, nodes_for_explicit_hist_build_,
-                                 nodes_for_subtraction_trick_);
-  }
+  histogram.SyncHistogram(&tree, nodes_for_explicit_hist_build_,
+                          nodes_for_subtraction_trick_, starting_index);

   using GHistRowT = common::GHistRow;
   auto check_hist = [](const GHistRowT parent, const GHistRowT left, const GHistRowT right,
@@ -487,4 +474,3 @@ TEST(CPUHistogram, ExternalMemory) {
     TestHistogramExternalMemory(&ctx, {kBins, sparse_thresh}, false, true);
   }
 }  // namespace xgboost::tree
-

From 04c99683c3b4f27afb8a877c67721ccd6715881a Mon Sep 17 00:00:00 2001
From: jinmfeng001 <102719116+jinmfeng001@users.noreply.github.com>
Date: Thu, 3 Aug 2023 23:40:04 +0800
Subject: [PATCH 063/136] Change training stage from ResultStage to
 ShuffleMapStage (#9423)

---
 .../main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
index 2f1f261fb..7bb245035 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
@@ -407,7 +407,10 @@ object XGBoost extends Serializable {
       }}

-      val (booster, metrics) = boostersAndMetrics.collect()(0)
+      // The repartition step turns the training stage into a ShuffleMapStage so that the
+      // stage can be retried when one of the training tasks fails. A ResultStage won't
+      // retry when it fails.
+      val (booster, metrics) = boostersAndMetrics.repartition(1).collect()(0)
       val trackerReturnVal = tracker.waitFor(0L)
       logger.info(s"Rabit returns with exit code $trackerReturnVal")
       if (trackerReturnVal != 0) {

From 1aabc690ec3969c3e62666cd702808d870e8671c Mon Sep 17 00:00:00 2001
From: Philip Hyunsu Cho
Date: Thu, 3 Aug 2023 20:42:07 -0700
Subject: [PATCH 064/136] [Doc] Clarify the output behavior of reg:logistic
 (#9435)

---
 doc/parameter.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/parameter.rst b/doc/parameter.rst
index 6f767c80d..1b1bb80a4 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -345,7 +345,7 @@ Specify the learning task and the corresponding learning objective. The objectiv
   - ``reg:squarederror``: regression with squared loss.
   - ``reg:squaredlogerror``: regression with squared log loss :math:`\frac{1}{2}[log(pred + 1) - log(label + 1)]^2`. All input labels are required to be greater than -1. Also, see metric ``rmsle`` for possible issue with this objective.
-  - ``reg:logistic``: logistic regression.
+  - ``reg:logistic``: logistic regression, output probability.
   - ``reg:pseudohubererror``: regression with Pseudo Huber loss, a twice differentiable alternative to absolute loss.
   - ``reg:absoluteerror``: Regression with L1 error. When tree model is used, leaf value is refreshed after tree construction. If used in distributed training, the leaf value is calculated as the mean value from all workers, which is not guaranteed to be optimal.
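The one-word doc change above documents behavior that is easy to misread: ``reg:logistic`` applies the sigmoid to the raw margin at prediction time, so predict() hands back values in (0, 1). A minimal sketch of that transform (the margin value is invented for illustration, not produced by any model):

    #include <cmath>
    #include <iostream>

    // The prediction transform behind reg:logistic: map a raw leaf-sum margin
    // into (0, 1) with the sigmoid; the probability is what predict() reports.
    double Sigmoid(double margin) { return 1.0 / (1.0 + std::exp(-margin)); }

    int main() {
      double raw_margin = 1.3;            // hypothetical sum of leaf values
      double prob = Sigmoid(raw_margin);
      std::cout << prob << "\n";          // ~0.786 -- a probability, not a class label
      bool label = prob > 0.5;            // thresholding is the caller's choice
      std::cout << label << "\n";
      return 0;
    }

Callers who need hard 0/1 labels threshold the probability themselves; ``binary:logitraw`` is the objective that leaves the margin untransformed.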
From bde1ebc209e749775ccced5155c8b1b4e7af240e Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Fri, 4 Aug 2023 00:14:31 -0700 Subject: [PATCH 065/136] Switch back to the GPUIDX macro (#9438) --- tests/cpp/common/test_quantile.cu | 6 ++-- tests/cpp/helpers.h | 15 ++++----- tests/cpp/linear/test_json_io.h | 2 +- tests/cpp/linear/test_linear.cc | 4 +-- tests/cpp/metric/test_auc.h | 12 +++---- tests/cpp/metric/test_elementwise_metric.h | 38 +++++++++++----------- tests/cpp/metric/test_metric.cc | 2 +- tests/cpp/metric/test_multiclass_metric.h | 8 ++--- tests/cpp/metric/test_rank_metric.cc | 2 +- tests/cpp/metric/test_rank_metric.h | 8 ++--- tests/cpp/metric/test_survival_metric.cu | 4 +-- tests/cpp/metric/test_survival_metric.h | 6 ++-- tests/cpp/objective/test_aft_obj.cc | 10 +++--- tests/cpp/objective/test_hinge.cc | 2 +- tests/cpp/objective/test_lambdarank_obj.cc | 2 +- tests/cpp/objective/test_multiclass_obj.cc | 6 ++-- tests/cpp/objective/test_quantile_obj.cc | 4 +-- tests/cpp/objective/test_regression_obj.cc | 32 +++++++++--------- tests/cpp/plugin/test_example_objective.cc | 2 +- tests/cpp/plugin/test_federated_adapter.cu | 4 +-- tests/cpp/predictor/test_gpu_predictor.cu | 4 +-- 21 files changed, 85 insertions(+), 88 deletions(-) diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index 28d698685..a5ecbb598 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -351,7 +351,7 @@ void TestAllReduceBasic() { auto const world = collective::GetWorldSize(); constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) { - auto const device = GetGPUId(); + auto const device = GPUIDX; // Set up single node version; HostDeviceVector ft({}, device); @@ -440,7 +440,7 @@ void TestColumnSplitBasic() { }()}; // Generate cuts for distributed environment. - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); HistogramCuts distributed_cuts = common::DeviceSketch(&ctx, m.get(), kBins); // Generate cuts for single node environment @@ -483,7 +483,7 @@ void TestSameOnAllWorkers() { RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) { auto const rank = collective::GetRank(); - auto const device = GetGPUId(); + auto const device = GPUIDX; HostDeviceVector ft({}, device); SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device); HostDeviceVector storage({}, device); diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index 6cb0b3405..e39375dfa 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -34,6 +34,12 @@ #define DeclareUnifiedTest(name) name #endif +#if defined(__CUDACC__) +#define GPUIDX (common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank()) +#else +#define GPUIDX (-1) +#endif + #if defined(__CUDACC__) #define DeclareUnifiedDistributedTest(name) MGPU ## name #else @@ -540,15 +546,6 @@ void RunWithInMemoryCommunicator(int32_t world_size, Function&& function, Args&& #endif } -inline int GetGPUId() { -#if defined(__CUDACC__) - auto const n_gpus = common::AllVisibleGPUs(); - return n_gpus == 1 ? 
0 : collective::GetRank(); -#else - return -1; -#endif -} - class BaseMGPUTest : public ::testing::Test { protected: int world_size_; diff --git a/tests/cpp/linear/test_json_io.h b/tests/cpp/linear/test_json_io.h index 48d4497c3..c423448e2 100644 --- a/tests/cpp/linear/test_json_io.h +++ b/tests/cpp/linear/test_json_io.h @@ -12,7 +12,7 @@ namespace xgboost { inline void TestUpdaterJsonIO(std::string updater_str) { - Context ctx{MakeCUDACtx(GetGPUId())}; + Context ctx{MakeCUDACtx(GPUIDX)}; Json config_0 {Object() }; { diff --git a/tests/cpp/linear/test_linear.cc b/tests/cpp/linear/test_linear.cc index f15a47e64..6b2d17e10 100644 --- a/tests/cpp/linear/test_linear.cc +++ b/tests/cpp/linear/test_linear.cc @@ -17,7 +17,7 @@ TEST(Linear, Shotgun) { auto p_fmat = xgboost::RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(); - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); LearnerModelParam mparam{MakeMP(kCols, .5, 1)}; { @@ -49,7 +49,7 @@ TEST(Linear, coordinate) { auto p_fmat = xgboost::RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(); - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); LearnerModelParam mparam{MakeMP(kCols, .5, 1)}; auto updater = std::unique_ptr( diff --git a/tests/cpp/metric/test_auc.h b/tests/cpp/metric/test_auc.h index cd0095ebb..0dd3dd83e 100644 --- a/tests/cpp/metric/test_auc.h +++ b/tests/cpp/metric/test_auc.h @@ -11,7 +11,7 @@ namespace xgboost { namespace metric { inline void VerifyBinaryAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); std::unique_ptr uni_ptr{Metric::Create("auc", &ctx)}; Metric* metric = uni_ptr.get(); ASSERT_STREQ(metric->Name(), "auc"); @@ -54,7 +54,7 @@ inline void VerifyBinaryAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) } inline void VerifyMultiClassAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); std::unique_ptr uni_ptr{Metric::Create("auc", &ctx)}; auto metric = uni_ptr.get(); @@ -115,7 +115,7 @@ inline void VerifyMultiClassAUC(DataSplitMode data_split_mode = DataSplitMode::k } inline void VerifyRankingAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); std::unique_ptr metric{Metric::Create("auc", &ctx)}; // single group @@ -149,7 +149,7 @@ inline void VerifyRankingAUC(DataSplitMode data_split_mode = DataSplitMode::kRow } inline void VerifyPRAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); xgboost::Metric* metric = xgboost::Metric::Create("aucpr", &ctx); ASSERT_STREQ(metric->Name(), "aucpr"); @@ -186,7 +186,7 @@ inline void VerifyPRAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) { } inline void VerifyMultiClassPRAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); std::unique_ptr metric{Metric::Create("aucpr", &ctx)}; @@ -210,7 +210,7 @@ inline void VerifyMultiClassPRAUC(DataSplitMode data_split_mode = DataSplitMode: } inline void VerifyRankingPRAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); std::unique_ptr metric{Metric::Create("aucpr", &ctx)}; diff --git a/tests/cpp/metric/test_elementwise_metric.h b/tests/cpp/metric/test_elementwise_metric.h index 9a3d3fe9f..a32bb0438 100644 --- 
a/tests/cpp/metric/test_elementwise_metric.h +++ b/tests/cpp/metric/test_elementwise_metric.h @@ -46,7 +46,7 @@ inline void CheckDeterministicMetricElementWise(StringView name, int32_t device) } inline void VerifyRMSE(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); xgboost::Metric * metric = xgboost::Metric::Create("rmse", &ctx); metric->Configure({}); ASSERT_STREQ(metric->Name(), "rmse"); @@ -71,11 +71,11 @@ inline void VerifyRMSE(DataSplitMode data_split_mode = DataSplitMode::kRow) { 0.6708f, 0.001f); delete metric; - CheckDeterministicMetricElementWise(StringView{"rmse"}, GetGPUId()); + CheckDeterministicMetricElementWise(StringView{"rmse"}, GPUIDX); } inline void VerifyRMSLE(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); xgboost::Metric * metric = xgboost::Metric::Create("rmsle", &ctx); metric->Configure({}); ASSERT_STREQ(metric->Name(), "rmsle"); @@ -100,11 +100,11 @@ inline void VerifyRMSLE(DataSplitMode data_split_mode = DataSplitMode::kRow) { 0.2415f, 1e-4); delete metric; - CheckDeterministicMetricElementWise(StringView{"rmsle"}, GetGPUId()); + CheckDeterministicMetricElementWise(StringView{"rmsle"}, GPUIDX); } inline void VerifyMAE(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); xgboost::Metric * metric = xgboost::Metric::Create("mae", &ctx); metric->Configure({}); ASSERT_STREQ(metric->Name(), "mae"); @@ -129,11 +129,11 @@ inline void VerifyMAE(DataSplitMode data_split_mode = DataSplitMode::kRow) { 0.54f, 0.001f); delete metric; - CheckDeterministicMetricElementWise(StringView{"mae"}, GetGPUId()); + CheckDeterministicMetricElementWise(StringView{"mae"}, GPUIDX); } inline void VerifyMAPE(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); xgboost::Metric * metric = xgboost::Metric::Create("mape", &ctx); metric->Configure({}); ASSERT_STREQ(metric->Name(), "mape"); @@ -158,11 +158,11 @@ inline void VerifyMAPE(DataSplitMode data_split_mode = DataSplitMode::kRow) { 1.3250f, 0.001f); delete metric; - CheckDeterministicMetricElementWise(StringView{"mape"}, GetGPUId()); + CheckDeterministicMetricElementWise(StringView{"mape"}, GPUIDX); } inline void VerifyMPHE(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); std::unique_ptr metric{xgboost::Metric::Create("mphe", &ctx)}; metric->Configure({}); ASSERT_STREQ(metric->Name(), "mphe"); @@ -186,7 +186,7 @@ inline void VerifyMPHE(DataSplitMode data_split_mode = DataSplitMode::kRow) { { 1, 2, 9, 8}, {}, data_split_mode), 0.1922f, 1e-4); - CheckDeterministicMetricElementWise(StringView{"mphe"}, GetGPUId()); + CheckDeterministicMetricElementWise(StringView{"mphe"}, GPUIDX); metric->Configure({{"huber_slope", "0.1"}}); EXPECT_NEAR(GetMetricEval(metric.get(), @@ -197,7 +197,7 @@ inline void VerifyMPHE(DataSplitMode data_split_mode = DataSplitMode::kRow) { } inline void VerifyLogLoss(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); xgboost::Metric * metric = xgboost::Metric::Create("logloss", &ctx); metric->Configure({}); ASSERT_STREQ(metric->Name(), "logloss"); @@ -226,11 +226,11 @@ inline void VerifyLogLoss(DataSplitMode data_split_mode = DataSplitMode::kRow) { 1.3138f, 0.001f); delete 
metric; - CheckDeterministicMetricElementWise(StringView{"logloss"}, GetGPUId()); + CheckDeterministicMetricElementWise(StringView{"logloss"}, GPUIDX); } inline void VerifyError(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); xgboost::Metric * metric = xgboost::Metric::Create("error", &ctx); metric->Configure({}); ASSERT_STREQ(metric->Name(), "error"); @@ -288,11 +288,11 @@ inline void VerifyError(DataSplitMode data_split_mode = DataSplitMode::kRow) { 0.45f, 0.001f); delete metric; - CheckDeterministicMetricElementWise(StringView{"error@0.5"}, GetGPUId()); + CheckDeterministicMetricElementWise(StringView{"error@0.5"}, GPUIDX); } inline void VerifyPoissonNegLogLik(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); xgboost::Metric * metric = xgboost::Metric::Create("poisson-nloglik", &ctx); metric->Configure({}); ASSERT_STREQ(metric->Name(), "poisson-nloglik"); @@ -321,18 +321,18 @@ inline void VerifyPoissonNegLogLik(DataSplitMode data_split_mode = DataSplitMode 1.5783f, 0.001f); delete metric; - CheckDeterministicMetricElementWise(StringView{"poisson-nloglik"}, GetGPUId()); + CheckDeterministicMetricElementWise(StringView{"poisson-nloglik"}, GPUIDX); } inline void VerifyMultiRMSE(DataSplitMode data_split_mode = DataSplitMode::kRow) { size_t n_samples = 32, n_targets = 8; - linalg::Tensor y{{n_samples, n_targets}, GetGPUId()}; + linalg::Tensor y{{n_samples, n_targets}, GPUIDX}; auto &h_y = y.Data()->HostVector(); std::iota(h_y.begin(), h_y.end(), 0); HostDeviceVector predt(n_samples * n_targets, 0); - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); std::unique_ptr metric{Metric::Create("rmse", &ctx)}; metric->Configure({}); @@ -347,7 +347,7 @@ inline void VerifyMultiRMSE(DataSplitMode data_split_mode = DataSplitMode::kRow) } inline void VerifyQuantile(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); std::unique_ptr metric{Metric::Create("quantile", &ctx)}; HostDeviceVector predts{0.1f, 0.9f, 0.1f, 0.9f}; diff --git a/tests/cpp/metric/test_metric.cc b/tests/cpp/metric/test_metric.cc index c629a1481..f43751366 100644 --- a/tests/cpp/metric/test_metric.cc +++ b/tests/cpp/metric/test_metric.cc @@ -4,7 +4,7 @@ #include "../helpers.h" namespace xgboost { TEST(Metric, UnknownMetric) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); xgboost::Metric* metric = nullptr; EXPECT_ANY_THROW(metric = xgboost::Metric::Create("unknown_name", &ctx)); EXPECT_NO_THROW(metric = xgboost::Metric::Create("rmse", &ctx)); diff --git a/tests/cpp/metric/test_multiclass_metric.h b/tests/cpp/metric/test_multiclass_metric.h index f147c91fa..5fdead596 100644 --- a/tests/cpp/metric/test_multiclass_metric.h +++ b/tests/cpp/metric/test_multiclass_metric.h @@ -60,8 +60,8 @@ inline void TestMultiClassError(int device, DataSplitMode data_split_mode) { } inline void VerifyMultiClassError(DataSplitMode data_split_mode = DataSplitMode::kRow) { - TestMultiClassError(GetGPUId(), data_split_mode); - CheckDeterministicMetricMultiClass(StringView{"merror"}, GetGPUId()); + TestMultiClassError(GPUIDX, data_split_mode); + CheckDeterministicMetricMultiClass(StringView{"merror"}, GPUIDX); } inline void TestMultiClassLogLoss(int device, DataSplitMode data_split_mode) { @@ -81,8 +81,8 @@ inline void TestMultiClassLogLoss(int device, DataSplitMode data_split_mode) { } 
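// GPUIDX resolves to device 0 when only one GPU is visible and to the
// communicator rank otherwise, so each worker in these multi-GPU tests
// targets its own device.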
inline void VerifyMultiClassLogLoss(DataSplitMode data_split_mode = DataSplitMode::kRow) { - TestMultiClassLogLoss(GetGPUId(), data_split_mode); - CheckDeterministicMetricMultiClass(StringView{"mlogloss"}, GetGPUId()); + TestMultiClassLogLoss(GPUIDX, data_split_mode); + CheckDeterministicMetricMultiClass(StringView{"mlogloss"}, GPUIDX); } } // namespace metric diff --git a/tests/cpp/metric/test_rank_metric.cc b/tests/cpp/metric/test_rank_metric.cc index 066e981b9..fbf0611b3 100644 --- a/tests/cpp/metric/test_rank_metric.cc +++ b/tests/cpp/metric/test_rank_metric.cc @@ -22,7 +22,7 @@ namespace metric { #if !defined(__CUDACC__) TEST(Metric, AMS) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); EXPECT_ANY_THROW(Metric::Create("ams", &ctx)); Metric* metric = Metric::Create("ams@0.5f", &ctx); ASSERT_STREQ(metric->Name(), "ams@0.5"); diff --git a/tests/cpp/metric/test_rank_metric.h b/tests/cpp/metric/test_rank_metric.h index 82d3725f5..2f7785689 100644 --- a/tests/cpp/metric/test_rank_metric.h +++ b/tests/cpp/metric/test_rank_metric.h @@ -20,7 +20,7 @@ namespace xgboost::metric { inline void VerifyPrecision(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); std::unique_ptr metric{Metric::Create("pre", &ctx)}; ASSERT_STREQ(metric->Name(), "pre"); EXPECT_NEAR(GetMetricEval(metric.get(), {0, 1}, {0, 1}, {}, {}, data_split_mode), 0.5, 1e-7); @@ -44,7 +44,7 @@ inline void VerifyPrecision(DataSplitMode data_split_mode = DataSplitMode::kRow) } inline void VerifyNDCG(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); Metric * metric = xgboost::Metric::Create("ndcg", &ctx); ASSERT_STREQ(metric->Name(), "ndcg"); EXPECT_ANY_THROW(GetMetricEval(metric, {0, 1}, {}, {}, {}, data_split_mode)); @@ -102,7 +102,7 @@ inline void VerifyNDCG(DataSplitMode data_split_mode = DataSplitMode::kRow) { } inline void VerifyMAP(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); Metric * metric = xgboost::Metric::Create("map", &ctx); ASSERT_STREQ(metric->Name(), "map"); EXPECT_NEAR(GetMetricEval(metric, {0, 1}, {0, 1}, {}, {}, data_split_mode), 1, kRtEps); @@ -150,7 +150,7 @@ inline void VerifyMAP(DataSplitMode data_split_mode = DataSplitMode::kRow) { } inline void VerifyNDCGExpGain(DataSplitMode data_split_mode = DataSplitMode::kRow) { - Context ctx = MakeCUDACtx(GetGPUId()); + Context ctx = MakeCUDACtx(GPUIDX); auto p_fmat = xgboost::RandomDataGenerator{0, 0, 0}.GenerateDMatrix(); MetaInfo& info = p_fmat->Info(); diff --git a/tests/cpp/metric/test_survival_metric.cu b/tests/cpp/metric/test_survival_metric.cu index da97b083b..eec92dc99 100644 --- a/tests/cpp/metric/test_survival_metric.cu +++ b/tests/cpp/metric/test_survival_metric.cu @@ -31,7 +31,7 @@ TEST_F(DeclareUnifiedDistributedTest(MetricTest), IntervalRegressionAccuracyColu // Test configuration of AFT metric TEST(AFTNegLogLikMetric, DeclareUnifiedTest(Configuration)) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); std::unique_ptr metric(Metric::Create("aft-nloglik", &ctx)); metric->Configure({{"aft_loss_distribution", "normal"}, {"aft_loss_distribution_scale", "10"}}); @@ -42,7 +42,7 @@ TEST(AFTNegLogLikMetric, DeclareUnifiedTest(Configuration)) { EXPECT_EQ(get(aft_param_json["aft_loss_distribution"]), "normal"); EXPECT_EQ(get(aft_param_json["aft_loss_distribution_scale"]), "10"); - 
CheckDeterministicMetricElementWise(StringView{"aft-nloglik"}, GetGPUId()); + CheckDeterministicMetricElementWise(StringView{"aft-nloglik"}, GPUIDX); } } // namespace common } // namespace xgboost diff --git a/tests/cpp/metric/test_survival_metric.h b/tests/cpp/metric/test_survival_metric.h index 5baa5b5a4..1626d3772 100644 --- a/tests/cpp/metric/test_survival_metric.h +++ b/tests/cpp/metric/test_survival_metric.h @@ -48,7 +48,7 @@ inline void CheckDeterministicMetricElementWise(StringView name, int32_t device) } inline void VerifyAFTNegLogLik(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); /** * Test aggregate output from the AFT metric over a small test data set. @@ -79,7 +79,7 @@ inline void VerifyAFTNegLogLik(DataSplitMode data_split_mode = DataSplitMode::kR } inline void VerifyIntervalRegressionAccuracy(DataSplitMode data_split_mode = DataSplitMode::kRow) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); auto p_fmat = EmptyDMatrix(); MetaInfo& info = p_fmat->Info(); @@ -101,7 +101,7 @@ inline void VerifyIntervalRegressionAccuracy(DataSplitMode data_split_mode = Dat info.labels_lower_bound_.HostVector()[0] = 70.0f; EXPECT_FLOAT_EQ(metric->Evaluate(preds, p_fmat), 0.25f); - CheckDeterministicMetricElementWise(StringView{"interval-regression-accuracy"}, GetGPUId()); + CheckDeterministicMetricElementWise(StringView{"interval-regression-accuracy"}, GPUIDX); } } // namespace common } // namespace xgboost diff --git a/tests/cpp/objective/test_aft_obj.cc b/tests/cpp/objective/test_aft_obj.cc index 60aebdf3a..74973918c 100644 --- a/tests/cpp/objective/test_aft_obj.cc +++ b/tests/cpp/objective/test_aft_obj.cc @@ -16,7 +16,7 @@ namespace xgboost { namespace common { TEST(Objective, DeclareUnifiedTest(AFTObjConfiguration)) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); std::unique_ptr objective(ObjFunction::Create("survival:aft", &ctx)); objective->Configure({ {"aft_loss_distribution", "logistic"}, {"aft_loss_distribution_scale", "5"} }); @@ -77,7 +77,7 @@ static inline void CheckGPairOverGridPoints( } TEST(Objective, DeclareUnifiedTest(AFTObjGPairUncensoredLabels)) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); std::unique_ptr obj(ObjFunction::Create("survival:aft", &ctx)); CheckGPairOverGridPoints(obj.get(), 100.0f, 100.0f, "normal", @@ -101,7 +101,7 @@ TEST(Objective, DeclareUnifiedTest(AFTObjGPairUncensoredLabels)) { } TEST(Objective, DeclareUnifiedTest(AFTObjGPairLeftCensoredLabels)) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); std::unique_ptr obj(ObjFunction::Create("survival:aft", &ctx)); CheckGPairOverGridPoints(obj.get(), 0.0f, 20.0f, "normal", @@ -122,7 +122,7 @@ TEST(Objective, DeclareUnifiedTest(AFTObjGPairLeftCensoredLabels)) { } TEST(Objective, DeclareUnifiedTest(AFTObjGPairRightCensoredLabels)) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); std::unique_ptr obj(ObjFunction::Create("survival:aft", &ctx)); CheckGPairOverGridPoints(obj.get(), 60.0f, std::numeric_limits::infinity(), "normal", @@ -146,7 +146,7 @@ TEST(Objective, DeclareUnifiedTest(AFTObjGPairRightCensoredLabels)) { } TEST(Objective, DeclareUnifiedTest(AFTObjGPairIntervalCensoredLabels)) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); std::unique_ptr obj(ObjFunction::Create("survival:aft", &ctx)); CheckGPairOverGridPoints(obj.get(), 16.0f, 200.0f, "normal", diff --git 
a/tests/cpp/objective/test_hinge.cc b/tests/cpp/objective/test_hinge.cc index a4b8525fa..17d2609d4 100644 --- a/tests/cpp/objective/test_hinge.cc +++ b/tests/cpp/objective/test_hinge.cc @@ -6,7 +6,7 @@ #include "../helpers.h" namespace xgboost { TEST(Objective, DeclareUnifiedTest(HingeObj)) { - Context ctx = MakeCUDACtx(GetGPUId()); + Context ctx = MakeCUDACtx(GPUIDX); std::unique_ptr obj{ObjFunction::Create("binary:hinge", &ctx)}; float eps = std::numeric_limits::min(); diff --git a/tests/cpp/objective/test_lambdarank_obj.cc b/tests/cpp/objective/test_lambdarank_obj.cc index 0c65780ae..c808e97f0 100644 --- a/tests/cpp/objective/test_lambdarank_obj.cc +++ b/tests/cpp/objective/test_lambdarank_obj.cc @@ -71,7 +71,7 @@ void TestNDCGGPair(Context const* ctx) { HostDeviceVector predts{0, 1, 0, 1}; MetaInfo info; - info.labels = linalg::Tensor{{0, 1, 0, 1}, {4, 1}, GetGPUId()}; + info.labels = linalg::Tensor{{0, 1, 0, 1}, {4, 1}, GPUIDX}; info.group_ptr_ = {0, 2, 4}; info.num_row_ = 4; HostDeviceVector gpairs; diff --git a/tests/cpp/objective/test_multiclass_obj.cc b/tests/cpp/objective/test_multiclass_obj.cc index fa8fc27e4..d028ef9cf 100644 --- a/tests/cpp/objective/test_multiclass_obj.cc +++ b/tests/cpp/objective/test_multiclass_obj.cc @@ -9,7 +9,7 @@ namespace xgboost { TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassObjGPair)) { - Context ctx = MakeCUDACtx(GetGPUId()); + Context ctx = MakeCUDACtx(GPUIDX); std::vector> args {{"num_class", "3"}}; std::unique_ptr obj { ObjFunction::Create("multi:softmax", &ctx) @@ -36,7 +36,7 @@ TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassObjGPair)) { } TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassBasic)) { - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); std::vector> args{ std::pair("num_class", "3")}; @@ -57,7 +57,7 @@ TEST(Objective, DeclareUnifiedTest(SoftmaxMultiClassBasic)) { } TEST(Objective, DeclareUnifiedTest(SoftprobMultiClassBasic)) { - Context ctx = MakeCUDACtx(GetGPUId()); + Context ctx = MakeCUDACtx(GPUIDX); std::vector> args { std::pair("num_class", "3")}; diff --git a/tests/cpp/objective/test_quantile_obj.cc b/tests/cpp/objective/test_quantile_obj.cc index 5078440bb..b263b4a8f 100644 --- a/tests/cpp/objective/test_quantile_obj.cc +++ b/tests/cpp/objective/test_quantile_obj.cc @@ -14,7 +14,7 @@ namespace xgboost { TEST(Objective, DeclareUnifiedTest(Quantile)) { - Context ctx = MakeCUDACtx(GetGPUId()); + Context ctx = MakeCUDACtx(GPUIDX); { Args args{{"quantile_alpha", "[0.6, 0.8]"}}; @@ -37,7 +37,7 @@ TEST(Objective, DeclareUnifiedTest(Quantile)) { } TEST(Objective, DeclareUnifiedTest(QuantileIntercept)) { - Context ctx = MakeCUDACtx(GetGPUId()); + Context ctx = MakeCUDACtx(GPUIDX); Args args{{"quantile_alpha", "[0.6, 0.8]"}}; std::unique_ptr obj{ObjFunction::Create("reg:quantileerror", &ctx)}; obj->Configure(args); diff --git a/tests/cpp/objective/test_regression_obj.cc b/tests/cpp/objective/test_regression_obj.cc index 635fae997..b8a40603b 100644 --- a/tests/cpp/objective/test_regression_obj.cc +++ b/tests/cpp/objective/test_regression_obj.cc @@ -17,7 +17,7 @@ namespace xgboost { TEST(Objective, DeclareUnifiedTest(LinearRegressionGPair)) { - Context ctx = MakeCUDACtx(GetGPUId()); + Context ctx = MakeCUDACtx(GPUIDX); std::vector> args; std::unique_ptr obj{ObjFunction::Create("reg:squarederror", &ctx)}; @@ -39,7 +39,7 @@ TEST(Objective, DeclareUnifiedTest(LinearRegressionGPair)) { } TEST(Objective, DeclareUnifiedTest(SquaredLog)) { - Context ctx = MakeCUDACtx(GetGPUId()); + Context ctx = 
MakeCUDACtx(GPUIDX); std::vector> args; std::unique_ptr obj{ObjFunction::Create("reg:squaredlogerror", &ctx)}; @@ -62,7 +62,7 @@ TEST(Objective, DeclareUnifiedTest(SquaredLog)) { } TEST(Objective, DeclareUnifiedTest(PseudoHuber)) { - Context ctx = MakeCUDACtx(GetGPUId()); + Context ctx = MakeCUDACtx(GPUIDX); Args args; std::unique_ptr obj{ObjFunction::Create("reg:pseudohubererror", &ctx)}; @@ -91,7 +91,7 @@ TEST(Objective, DeclareUnifiedTest(PseudoHuber)) { } TEST(Objective, DeclareUnifiedTest(LogisticRegressionGPair)) { - Context ctx = MakeCUDACtx(GetGPUId()); + Context ctx = MakeCUDACtx(GPUIDX); std::vector> args; std::unique_ptr obj{ObjFunction::Create("reg:logistic", &ctx)}; @@ -107,7 +107,7 @@ TEST(Objective, DeclareUnifiedTest(LogisticRegressionGPair)) { } TEST(Objective, DeclareUnifiedTest(LogisticRegressionBasic)) { - Context ctx = MakeCUDACtx(GetGPUId()); + Context ctx = MakeCUDACtx(GPUIDX); std::vector> args; std::unique_ptr obj{ObjFunction::Create("reg:logistic", &ctx)}; @@ -136,7 +136,7 @@ TEST(Objective, DeclareUnifiedTest(LogisticRegressionBasic)) { } TEST(Objective, DeclareUnifiedTest(LogisticRawGPair)) { - Context ctx = MakeCUDACtx(GetGPUId()); + Context ctx = MakeCUDACtx(GPUIDX); std::vector> args; std::unique_ptr obj { ObjFunction::Create("binary:logitraw", &ctx) @@ -152,7 +152,7 @@ TEST(Objective, DeclareUnifiedTest(LogisticRawGPair)) { } TEST(Objective, DeclareUnifiedTest(PoissonRegressionGPair)) { - Context ctx = MakeCUDACtx(GetGPUId()); + Context ctx = MakeCUDACtx(GPUIDX); std::vector> args; std::unique_ptr obj { ObjFunction::Create("count:poisson", &ctx) @@ -176,7 +176,7 @@ TEST(Objective, DeclareUnifiedTest(PoissonRegressionGPair)) { } TEST(Objective, DeclareUnifiedTest(PoissonRegressionBasic)) { - Context ctx = MakeCUDACtx(GetGPUId()); + Context ctx = MakeCUDACtx(GPUIDX); std::vector> args; std::unique_ptr obj { ObjFunction::Create("count:poisson", &ctx) @@ -205,7 +205,7 @@ TEST(Objective, DeclareUnifiedTest(PoissonRegressionBasic)) { } TEST(Objective, DeclareUnifiedTest(GammaRegressionGPair)) { - Context ctx = MakeCUDACtx(GetGPUId()); + Context ctx = MakeCUDACtx(GPUIDX); std::vector> args; std::unique_ptr obj { ObjFunction::Create("reg:gamma", &ctx) @@ -227,7 +227,7 @@ TEST(Objective, DeclareUnifiedTest(GammaRegressionGPair)) { } TEST(Objective, DeclareUnifiedTest(GammaRegressionBasic)) { - Context ctx = MakeCUDACtx(GetGPUId()); + Context ctx = MakeCUDACtx(GPUIDX); std::vector> args; std::unique_ptr obj{ObjFunction::Create("reg:gamma", &ctx)}; @@ -256,7 +256,7 @@ TEST(Objective, DeclareUnifiedTest(GammaRegressionBasic)) { } TEST(Objective, DeclareUnifiedTest(TweedieRegressionGPair)) { - Context ctx = MakeCUDACtx(GetGPUId()); + Context ctx = MakeCUDACtx(GPUIDX); std::vector> args; std::unique_ptr obj{ObjFunction::Create("reg:tweedie", &ctx)}; @@ -280,7 +280,7 @@ TEST(Objective, DeclareUnifiedTest(TweedieRegressionGPair)) { #if defined(__CUDACC__) TEST(Objective, CPU_vs_CUDA) { - Context ctx = MakeCUDACtx(GetGPUId()); + Context ctx = MakeCUDACtx(GPUIDX); ObjFunction* obj = ObjFunction::Create("reg:squarederror", &ctx); HostDeviceVector cpu_out_preds; @@ -331,7 +331,7 @@ TEST(Objective, CPU_vs_CUDA) { #endif TEST(Objective, DeclareUnifiedTest(TweedieRegressionBasic)) { - Context ctx = MakeCUDACtx(GetGPUId()); + Context ctx = MakeCUDACtx(GPUIDX); std::vector> args; std::unique_ptr obj{ObjFunction::Create("reg:tweedie", &ctx)}; @@ -360,7 +360,7 @@ TEST(Objective, DeclareUnifiedTest(TweedieRegressionBasic)) { // CoxRegression not implemented in GPU code, no need for 
testing. #if !defined(__CUDACC__) TEST(Objective, CoxRegressionGPair) { - Context ctx = MakeCUDACtx(GetGPUId()); + Context ctx = MakeCUDACtx(GPUIDX); std::vector> args; std::unique_ptr obj{ObjFunction::Create("survival:cox", &ctx)}; @@ -375,7 +375,7 @@ TEST(Objective, CoxRegressionGPair) { #endif TEST(Objective, DeclareUnifiedTest(AbsoluteError)) { - Context ctx = MakeCUDACtx(GetGPUId()); + Context ctx = MakeCUDACtx(GPUIDX); std::unique_ptr obj{ObjFunction::Create("reg:absoluteerror", &ctx)}; obj->Configure({}); CheckConfigReload(obj, "reg:absoluteerror"); @@ -419,7 +419,7 @@ TEST(Objective, DeclareUnifiedTest(AbsoluteError)) { } TEST(Objective, DeclareUnifiedTest(AbsoluteErrorLeaf)) { - Context ctx = MakeCUDACtx(GetGPUId()); + Context ctx = MakeCUDACtx(GPUIDX); bst_target_t constexpr kTargets = 3, kRows = 16; std::unique_ptr obj{ObjFunction::Create("reg:absoluteerror", &ctx)}; obj->Configure({}); diff --git a/tests/cpp/plugin/test_example_objective.cc b/tests/cpp/plugin/test_example_objective.cc index 29fe2ad2b..ccb83c781 100644 --- a/tests/cpp/plugin/test_example_objective.cc +++ b/tests/cpp/plugin/test_example_objective.cc @@ -5,7 +5,7 @@ namespace xgboost { TEST(Plugin, ExampleObjective) { - xgboost::Context ctx = MakeCUDACtx(GetGPUId()); + xgboost::Context ctx = MakeCUDACtx(GPUIDX); auto* obj = xgboost::ObjFunction::Create("mylogistic", &ctx); ASSERT_EQ(obj->DefaultEvalMetric(), std::string{"logloss"}); delete obj; diff --git a/tests/cpp/plugin/test_federated_adapter.cu b/tests/cpp/plugin/test_federated_adapter.cu index 75422fcca..8aa5304ea 100644 --- a/tests/cpp/plugin/test_federated_adapter.cu +++ b/tests/cpp/plugin/test_federated_adapter.cu @@ -27,7 +27,7 @@ namespace { void VerifyAllReduceSum() { auto const world_size = collective::GetWorldSize(); auto const rank = collective::GetRank(); - auto const device = GetGPUId(); + auto const device = GPUIDX; int count = 3; common::SetDevice(device); thrust::device_vector buffer(count, 0); @@ -49,7 +49,7 @@ namespace { void VerifyAllGatherV() { auto const world_size = collective::GetWorldSize(); auto const rank = collective::GetRank(); - auto const device = GetGPUId(); + auto const device = GPUIDX; int const count = rank + 2; common::SetDevice(device); thrust::device_vector buffer(count, 0); diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu index ecddf2288..3a65e3e06 100644 --- a/tests/cpp/predictor/test_gpu_predictor.cu +++ b/tests/cpp/predictor/test_gpu_predictor.cu @@ -60,7 +60,7 @@ void VerifyBasicColumnSplit(std::array, 32> const& expected_r auto const world_size = collective::GetWorldSize(); auto const rank = collective::GetRank(); - auto ctx = MakeCUDACtx(GetGPUId()); + auto ctx = MakeCUDACtx(GPUIDX); std::unique_ptr predictor = std::unique_ptr(Predictor::Create("gpu_predictor", &ctx)); predictor->Configure({}); @@ -283,7 +283,7 @@ TEST(GPUPredictor, CategoricalPredictLeaf) { TEST(GPUPredictor, PredictLeafBasic) { size_t constexpr kRows = 5, kCols = 5; auto dmat = RandomDataGenerator(kRows, kCols, 0).Device(0).GenerateDMatrix(); - auto lparam = MakeCUDACtx(GetGPUId()); + auto lparam = MakeCUDACtx(GPUIDX); std::unique_ptr gpu_predictor = std::unique_ptr(Predictor::Create("gpu_predictor", &lparam)); gpu_predictor->Configure({}); From 7fc57f39743352990b5be66dae11c9f15e0f74c4 Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Fri, 4 Aug 2023 06:52:27 -0700 Subject: [PATCH 066/136] Remove Koffie Labs from Sponsors list (#9434) --- README.md | 1 - 1 file changed, 1 deletion(-) diff 
--git a/README.md b/README.md index 92c246dfd..063b29125 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,6 @@ Become a sponsor and get a logo here. See details at [Sponsoring the XGBoost Pro NVIDIA - ### Backers [[Become a backer](https://opencollective.com/xgboost#backer)] From 5bd163aa259d2fa19054fe47c6564f5aeb88ed40 Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Sat, 5 Aug 2023 14:15:44 -0700 Subject: [PATCH 067/136] Explicitly specify libcudart_static in CMake config (#9436) --- CMakeLists.txt | 9 +++++++++ cmake/Utils.cmake | 4 +++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a026888df..6a3fd3c73 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -232,6 +232,15 @@ add_subdirectory(${xgboost_SOURCE_DIR}/plugin) if (PLUGIN_RMM) find_package(rmm REQUIRED) + + # Patch the rmm targets so they reference the static cudart + # Remove this patch once RMM stops specifying cudart requirement + # (since RMM is a header-only library, it should not specify cudart in its CMake config) + get_target_property(rmm_link_libs rmm::rmm INTERFACE_LINK_LIBRARIES) + list(REMOVE_ITEM rmm_link_libs CUDA::cudart) + list(APPEND rmm_link_libs CUDA::cudart_static) + set_target_properties(rmm::rmm PROPERTIES INTERFACE_LINK_LIBRARIES "${rmm_link_libs}") + get_target_property(rmm_link_libs rmm::rmm INTERFACE_LINK_LIBRARIES) endif (PLUGIN_RMM) #-- library diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index 98e96e304..08050205c 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -177,7 +177,8 @@ function(xgboost_set_cuda_flags target) set_target_properties(${target} PROPERTIES CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON - CUDA_SEPARABLE_COMPILATION OFF) + CUDA_SEPARABLE_COMPILATION OFF + CUDA_RUNTIME_LIBRARY Static) endfunction(xgboost_set_cuda_flags) macro(xgboost_link_nccl target) @@ -279,6 +280,7 @@ macro(xgboost_target_link_libraries target) if (USE_CUDA) xgboost_set_cuda_flags(${target}) + target_link_libraries(${target} PUBLIC CUDA::cudart_static) endif (USE_CUDA) if (PLUGIN_RMM) From 54029a59af1632bd953e058855e0277292d8bee0 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 8 Aug 2023 03:21:26 +0800 Subject: [PATCH 068/136] Bound the size of the histogram cache. (#9440) - A new histogram collection with a limit in size. - Unify histogram building logic between hist, multi-hist, and approx. 
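In rough terms, the new collection keeps each cached node's histogram row in one
flat buffer behind a node-id-to-offset map and refuses to grow past a fixed node
budget; the caller is expected to clear and rebuild when a batch would overflow.
A minimal sketch of that idea (the class and member names here are illustrative
assumptions, not the actual API added by this patch):

    #include <cstddef>
    #include <unordered_map>
    #include <vector>

    // Illustrative sketch only: histogram rows for all cached nodes live in one
    // contiguous buffer; the cache refuses batches that would exceed its node
    // budget and expects the caller to Clear() and rebuild instead.
    class BoundedNodeHistCache {
     public:
      BoundedNodeHistCache(std::size_t n_bins, std::size_t n_cached_nodes)
          : n_bins_{n_bins}, limit_{n_cached_nodes} {}

      // True iff `n_new` additional nodes still fit under the budget.
      bool CanHost(std::size_t n_new) const { return node_map_.size() + n_new <= limit_; }

      // Caller checks CanHost() first; on overflow it calls Clear() and re-allocates.
      void Allocate(std::vector<int> const& nids) {
        for (int nid : nids) {
          node_map_[nid] = data_.size();              // offset of this node's row
          data_.resize(data_.size() + n_bins_, 0.0);  // zero-initialized histogram
        }
      }

      // Rows may move when `data_` grows, so fetch pointers fresh at each use.
      double* operator[](int nid) { return data_.data() + node_map_.at(nid); }

      void Clear() {
        node_map_.clear();
        data_.clear();
      }

     private:
      std::size_t n_bins_, limit_;
      std::unordered_map<int, std::size_t> node_map_;  // node id -> offset in data_
      std::vector<double> data_;                       // all histogram rows, contiguous
    };

The collection added below additionally separates the per-batch node count, which
the Driver class already bounds, from the total cached-node budget.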
--- R-package/src/Makevars.in | 1 + R-package/src/Makevars.win | 1 + include/xgboost/base.h | 8 +- python-package/xgboost/testing/data_iter.py | 34 ++ python-package/xgboost/testing/params.py | 4 + src/common/hist_util.cc | 11 - src/common/hist_util.h | 28 +- src/common/threading_utils.h | 37 +- src/data/adapter.h | 11 +- src/tree/hist/evaluate_splits.h | 19 +- src/tree/hist/expand_entry.h | 4 +- src/tree/hist/hist_cache.h | 109 +++++ src/tree/hist/histogram.cc | 63 +++ src/tree/hist/histogram.h | 399 ++++++++++++------ src/tree/hist/param.h | 15 +- src/tree/param.h | 2 +- src/tree/updater_approx.cc | 104 ++--- src/tree/updater_quantile_hist.cc | 155 ++----- tests/cpp/common/test_hist_util.cc | 6 +- tests/cpp/test_learner.cc | 2 +- tests/cpp/tree/hist/test_evaluate_splits.cc | 48 ++- tests/cpp/tree/hist/test_histogram.cc | 377 +++++++++++------ tests/cpp/tree/test_evaluate_splits.h | 36 +- .../test_device_quantile_dmatrix.py | 4 + tests/python/test_quantile_dmatrix.py | 4 + tests/python/test_updaters.py | 45 +- .../test_with_dask/test_with_dask.py | 32 +- 27 files changed, 994 insertions(+), 565 deletions(-) create mode 100644 python-package/xgboost/testing/data_iter.py create mode 100644 src/tree/hist/hist_cache.h create mode 100644 src/tree/hist/histogram.cc diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in index f03bbc73f..a93f773f9 100644 --- a/R-package/src/Makevars.in +++ b/R-package/src/Makevars.in @@ -69,6 +69,7 @@ OBJECTS= \ $(PKGROOT)/src/tree/updater_refresh.o \ $(PKGROOT)/src/tree/updater_sync.o \ $(PKGROOT)/src/tree/hist/param.o \ + $(PKGROOT)/src/tree/hist/histogram.o \ $(PKGROOT)/src/linear/linear_updater.o \ $(PKGROOT)/src/linear/updater_coordinate.o \ $(PKGROOT)/src/linear/updater_shotgun.o \ diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win index 9f4d0d5f3..d2f47b2aa 100644 --- a/R-package/src/Makevars.win +++ b/R-package/src/Makevars.win @@ -69,6 +69,7 @@ OBJECTS= \ $(PKGROOT)/src/tree/updater_refresh.o \ $(PKGROOT)/src/tree/updater_sync.o \ $(PKGROOT)/src/tree/hist/param.o \ + $(PKGROOT)/src/tree/hist/histogram.o \ $(PKGROOT)/src/linear/linear_updater.o \ $(PKGROOT)/src/linear/updater_coordinate.o \ $(PKGROOT)/src/linear/updater_shotgun.o \ diff --git a/include/xgboost/base.h b/include/xgboost/base.h index 9a61151f4..a5edadb6c 100644 --- a/include/xgboost/base.h +++ b/include/xgboost/base.h @@ -91,8 +91,6 @@ namespace xgboost { /*! \brief unsigned integer type used for feature index. */ using bst_uint = uint32_t; // NOLINT -/*! \brief integer type. */ -using bst_int = int32_t; // NOLINT /*! \brief unsigned long integers */ using bst_ulong = uint64_t; // NOLINT /*! \brief float type, used for storing statistics */ @@ -138,9 +136,9 @@ namespace detail { template class GradientPairInternal { /*! \brief gradient statistics */ - T grad_; + T grad_{0}; /*! 
\brief second order gradient statistics */ - T hess_; + T hess_{0}; XGBOOST_DEVICE void SetGrad(T g) { grad_ = g; } XGBOOST_DEVICE void SetHess(T h) { hess_ = h; } @@ -157,7 +155,7 @@ class GradientPairInternal { a += b; } - XGBOOST_DEVICE GradientPairInternal() : grad_(0), hess_(0) {} + GradientPairInternal() = default; XGBOOST_DEVICE GradientPairInternal(T grad, T hess) { SetGrad(grad); diff --git a/python-package/xgboost/testing/data_iter.py b/python-package/xgboost/testing/data_iter.py new file mode 100644 index 000000000..18f8eb378 --- /dev/null +++ b/python-package/xgboost/testing/data_iter.py @@ -0,0 +1,34 @@ +"""Tests related to the `DataIter` interface.""" +import numpy as np + +import xgboost +from xgboost import testing as tm + + +def run_mixed_sparsity(device: str) -> None: + """Check QDM with mixed batches.""" + X_0, y_0, _ = tm.make_regression(128, 16, False) + if device.startswith("cuda"): + X_1, y_1 = tm.make_sparse_regression(256, 16, 0.1, True) + else: + X_1, y_1 = tm.make_sparse_regression(256, 16, 0.1, False) + X_2, y_2 = tm.make_sparse_regression(512, 16, 0.9, True) + X = [X_0, X_1, X_2] + y = [y_0, y_1, y_2] + + if device.startswith("cuda"): + import cupy as cp # pylint: disable=import-error + + X = [cp.array(batch) for batch in X] + + it = tm.IteratorForTest(X, y, None, None) + Xy_0 = xgboost.QuantileDMatrix(it) + + X_1, y_1 = tm.make_sparse_regression(256, 16, 0.1, True) + X = [X_0, X_1, X_2] + y = [y_0, y_1, y_2] + X_arr = np.concatenate(X, axis=0) + y_arr = np.concatenate(y, axis=0) + Xy_1 = xgboost.QuantileDMatrix(X_arr, y_arr) + + assert tm.predictor_equal(Xy_0, Xy_1) diff --git a/python-package/xgboost/testing/params.py b/python-package/xgboost/testing/params.py index 8dc91b601..4ed8f4c4e 100644 --- a/python-package/xgboost/testing/params.py +++ b/python-package/xgboost/testing/params.py @@ -41,6 +41,10 @@ hist_parameter_strategy = strategies.fixed_dictionaries( and (cast(int, x["max_depth"]) > 0 or x["grow_policy"] == "lossguide") ) +hist_cache_strategy = strategies.fixed_dictionaries( + {"internal_max_cached_hist_node": strategies.sampled_from([1, 4, 1024, 2**31])} +) + hist_multi_parameter_strategy = strategies.fixed_dictionaries( { "max_depth": strategies.integers(1, 11), diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index e52ce1f66..65ab18630 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -67,17 +67,6 @@ HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins return out; } -/*! - * \brief fill a histogram by zeros in range [begin, end) - */ -void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end) { -#if defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1 - std::fill(hist.begin() + begin, hist.begin() + end, xgboost::GradientPairPrecise()); -#else // defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1 - memset(hist.data() + begin, '\0', (end - begin) * sizeof(xgboost::GradientPairPrecise)); -#endif // defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1 -} - /*! * \brief Increment hist as dst += add in range [begin, end) */ diff --git a/src/common/hist_util.h b/src/common/hist_util.h index 9bc44409e..fbbd15b49 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -364,11 +364,6 @@ bst_bin_t XGBOOST_HOST_DEV_INLINE BinarySearchBin(std::size_t begin, std::size_t using GHistRow = Span; using ConstGHistRow = Span; -/*! - * \brief fill a histogram by zeros - */ -void InitilizeHistByZeroes(GHistRow hist, size_t begin, size_t end); - /*! 
* \brief Increment hist as dst += add in range [begin, end) */ @@ -395,12 +390,7 @@ class HistCollection { constexpr uint32_t kMax = std::numeric_limits::max(); const size_t id = row_ptr_.at(nid); CHECK_NE(id, kMax); - GradientPairPrecise* ptr = nullptr; - if (contiguous_allocation_) { - ptr = const_cast(data_[0].data() + nbins_*id); - } else { - ptr = const_cast(data_[id].data()); - } + GradientPairPrecise* ptr = const_cast(data_[id].data()); return {ptr, nbins_}; } @@ -445,24 +435,12 @@ class HistCollection { data_[row_ptr_[nid]].resize(nbins_, {0, 0}); } } - // allocate common buffer contiguously for all nodes, need for single Allreduce call - void AllocateAllData() { - const size_t new_size = nbins_*data_.size(); - contiguous_allocation_ = true; - if (data_[0].size() != new_size) { - data_[0].resize(new_size); - } - } - [[nodiscard]] bool IsContiguous() const { return contiguous_allocation_; } private: /*! \brief number of all bins over all features */ uint32_t nbins_ = 0; /*! \brief amount of active nodes in hist collection */ uint32_t n_nodes_added_ = 0; - /*! \brief flag to identify contiguous memory allocation */ - bool contiguous_allocation_ = false; - std::vector> data_; /*! \brief row_ptr_[nid] locates bin for histogram of node nid */ @@ -518,7 +496,7 @@ class ParallelGHistBuilder { GHistRow hist = idx == -1 ? targeted_hists_[nid] : hist_buffer_[idx]; if (!hist_was_used_[tid * nodes_ + nid]) { - InitilizeHistByZeroes(hist, 0, hist.size()); + std::fill_n(hist.data(), hist.size(), GradientPairPrecise{}); hist_was_used_[tid * nodes_ + nid] = static_cast(true); } @@ -548,7 +526,7 @@ class ParallelGHistBuilder { if (!is_updated) { // In distributed mode - some tree nodes can be empty on local machines, // So we need just set local hist by zeros in this case - InitilizeHistByZeroes(dst, begin, end); + std::fill(dst.data() + begin, dst.data() + end, GradientPairPrecise{}); } } diff --git a/src/common/threading_utils.h b/src/common/threading_utils.h index 9c7483847..3c1636906 100644 --- a/src/common/threading_utils.h +++ b/src/common/threading_utils.h @@ -7,13 +7,14 @@ #include #include -#include -#include // for int32_t -#include // for malloc, free -#include +#include // for min +#include // for size_t +#include // for int32_t +#include // for malloc, free +#include // for function #include // for bad_alloc -#include // for is_signed -#include +#include // for is_signed, conditional_t +#include // for vector #include "xgboost/logging.h" @@ -25,6 +26,8 @@ inline int32_t omp_get_thread_limit() __GOMP_NOTHROW { return 1; } // NOLINT // MSVC doesn't implement the thread limit. 
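// The stub below reports the largest representable value as the limit, so callers
// can clamp their thread count against omp_get_thread_limit() on every platform.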
#if defined(_OPENMP) && defined(_MSC_VER) +#include + extern "C" { inline int32_t omp_get_thread_limit() { return std::numeric_limits::max(); } // NOLINT } @@ -84,8 +87,8 @@ class BlockedSpace2d { // dim1 - size of the first dimension in the space // getter_size_dim2 - functor to get the second dimensions for each 'row' by row-index // grain_size - max size of produced blocks - template - BlockedSpace2d(std::size_t dim1, Func getter_size_dim2, std::size_t grain_size) { + BlockedSpace2d(std::size_t dim1, std::function getter_size_dim2, + std::size_t grain_size) { for (std::size_t i = 0; i < dim1; ++i) { std::size_t size = getter_size_dim2(i); // Each row (second dim) is divided into n_blocks @@ -104,13 +107,13 @@ class BlockedSpace2d { } // get index of the first dimension of i-th block(task) - [[nodiscard]] std::size_t GetFirstDimension(size_t i) const { + [[nodiscard]] std::size_t GetFirstDimension(std::size_t i) const { CHECK_LT(i, first_dimension_.size()); return first_dimension_[i]; } // get a range of indexes for the second dimension of i-th block(task) - [[nodiscard]] Range1d GetRange(size_t i) const { + [[nodiscard]] Range1d GetRange(std::size_t i) const { CHECK_LT(i, ranges_.size()); return ranges_[i]; } @@ -129,22 +132,22 @@ class BlockedSpace2d { } std::vector ranges_; - std::vector first_dimension_; + std::vector first_dimension_; }; // Wrapper to implement nested parallelism with simple omp parallel for -template -void ParallelFor2d(const BlockedSpace2d& space, int nthreads, Func func) { +inline void ParallelFor2d(BlockedSpace2d const& space, std::int32_t n_threads, + std::function func) { std::size_t n_blocks_in_space = space.Size(); - CHECK_GE(nthreads, 1); + CHECK_GE(n_threads, 1); dmlc::OMPException exc; -#pragma omp parallel num_threads(nthreads) +#pragma omp parallel num_threads(n_threads) { exc.Run([&]() { - size_t tid = omp_get_thread_num(); - size_t chunck_size = n_blocks_in_space / nthreads + !!(n_blocks_in_space % nthreads); + std::size_t tid = omp_get_thread_num(); + std::size_t chunck_size = n_blocks_in_space / n_threads + !!(n_blocks_in_space % n_threads); std::size_t begin = chunck_size * tid; std::size_t end = std::min(begin + chunck_size, n_blocks_in_space); diff --git a/src/data/adapter.h b/src/data/adapter.h index 7776177ab..1463a13a7 100644 --- a/src/data/adapter.h +++ b/src/data/adapter.h @@ -477,7 +477,6 @@ class CSCArrayAdapterBatch : public detail::NoMetaInfo { ArrayInterface<1> indptr_; ArrayInterface<1> indices_; ArrayInterface<1> values_; - bst_row_t n_rows_; class Line { std::size_t column_idx_; @@ -503,11 +502,8 @@ class CSCArrayAdapterBatch : public detail::NoMetaInfo { static constexpr bool kIsRowMajor = false; CSCArrayAdapterBatch(ArrayInterface<1> indptr, ArrayInterface<1> indices, - ArrayInterface<1> values, bst_row_t n_rows) - : indptr_{std::move(indptr)}, - indices_{std::move(indices)}, - values_{std::move(values)}, - n_rows_{n_rows} {} + ArrayInterface<1> values) + : indptr_{std::move(indptr)}, indices_{std::move(indices)}, values_{std::move(values)} {} std::size_t Size() const { return indptr_.n - 1; } Line GetLine(std::size_t idx) const { @@ -542,8 +538,7 @@ class CSCArrayAdapter : public detail::SingleBatchDataIter indices_{indices}, values_{values}, num_rows_{num_rows}, - batch_{ - CSCArrayAdapterBatch{indptr_, indices_, values_, static_cast(num_rows_)}} {} + batch_{CSCArrayAdapterBatch{indptr_, indices_, values_}} {} // JVM package sends 0 as unknown size_t NumRows() const { return num_rows_ == 0 ? 
kAdapterUnknownSize : num_rows_; }
diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h
index f4e44fa52..82dc99b12 100644
--- a/src/tree/hist/evaluate_splits.h
+++ b/src/tree/hist/evaluate_splits.h
@@ -4,13 +4,13 @@
 #ifndef XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
 #define XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_
 
-#include <algorithm>  // for copy
-#include <cstddef>    // for size_t
-#include <limits>     // for numeric_limits
-#include <memory>     // for shared_ptr
-#include <numeric>    // for accumulate
-#include <utility>    // for move
-#include <vector>     // for vector
+#include <algorithm>   // for copy
+#include <cstddef>     // for size_t
+#include <limits>      // for numeric_limits
+#include <memory>      // for shared_ptr
+#include <numeric>     // for accumulate
+#include <utility>     // for move
+#include <vector>      // for vector
 
 #include "../../common/categorical.h"  // for CatBitField
 #include "../../common/hist_util.h"    // for GHistRow, HistogramCuts
@@ -20,6 +20,7 @@
 #include "../param.h"                  // for TrainParam
 #include "../split_evaluator.h"        // for TreeEvaluator
 #include "expand_entry.h"              // for MultiExpandEntry
+#include "hist_cache.h"                // for BoundedHistCollection
 #include "xgboost/base.h"              // for bst_node_t, bst_target_t, bst_feature_t
 #include "xgboost/context.h"           // for Context
 #include "xgboost/linalg.h"            // for Constants, Vector
@@ -317,7 +318,7 @@ class HistEvaluator {
   }
 
  public:
-  void EvaluateSplits(const common::HistCollection &hist, common::HistogramCuts const &cut,
+  void EvaluateSplits(const BoundedHistCollection &hist, common::HistogramCuts const &cut,
                       common::Span<FeatureType const> feature_types, const RegTree &tree,
                       std::vector<CPUExpandEntry> *p_entries) {
     auto n_threads = ctx_->Threads();
@@ -623,7 +624,7 @@ class HistMultiEvaluator {
   }
 
  public:
-  void EvaluateSplits(RegTree const &tree, common::Span<common::HistCollection const *> hist,
+  void EvaluateSplits(RegTree const &tree, common::Span<BoundedHistCollection const *> hist,
                       common::HistogramCuts const &cut, std::vector<MultiExpandEntry> *p_entries) {
     auto &entries = *p_entries;
     std::vector<std::shared_ptr<HostDeviceVector<bst_feature_t>>> features(entries.size());
diff --git a/src/tree/hist/expand_entry.h b/src/tree/hist/expand_entry.h
index e7e19be06..0225a5110 100644
--- a/src/tree/hist/expand_entry.h
+++ b/src/tree/hist/expand_entry.h
@@ -18,8 +18,8 @@ namespace xgboost::tree {
  */
 template <typename Impl>
 struct ExpandEntryImpl {
-  bst_node_t nid;
-  bst_node_t depth;
+  bst_node_t nid{0};
+  bst_node_t depth{0};
 
   [[nodiscard]] float GetLossChange() const {
     return static_cast<Impl const *>(this)->split.loss_chg;
diff --git a/src/tree/hist/hist_cache.h b/src/tree/hist/hist_cache.h
new file mode 100644
index 000000000..79e5d9bad
--- /dev/null
+++ b/src/tree/hist/hist_cache.h
@@ -0,0 +1,109 @@
+/**
+ * Copyright 2023 by XGBoost Contributors
+ */
+#ifndef XGBOOST_TREE_HIST_HIST_CACHE_H_
+#define XGBOOST_TREE_HIST_HIST_CACHE_H_
+#include <cstddef>  // for size_t
+#include <map>      // for map
+#include <vector>   // for vector
+
+#include "../../common/hist_util.h"  // for GHistRow, ConstGHistRow
+#include "xgboost/base.h"            // for bst_node_t, bst_bin_t
+#include "xgboost/logging.h"         // for CHECK_GT
+#include "xgboost/span.h"            // for Span
+
+namespace xgboost::tree {
+/**
+ * @brief A persistent cache for CPU histograms.
+ *
+ * The size of the cache is first bounded by the `Driver` class, then by this cache
+ * implementation. The former limits the number of nodes that can be built for each node
+ * batch, while this cache limits the total number of nodes to
+ * max(|node_batch|, n_cached_node).
+ *
+ * The caller is responsible for clearing up the cache, as it needs to rearrange the
+ * nodes before making overflowed allocations. The struct only reports whether the size
+ * limit has been reached.
+ */
+class BoundedHistCollection {
+  // maps node index to offset in `data_`.
+  std::map<bst_node_t, std::size_t> node_map_;
+  // currently allocated bins, used for tracking consistency.
+  std::size_t current_size_{0};
+
+  // stores the histograms in a contiguous buffer
+  std::vector<GradientPairPrecise> data_;
+
+  // number of histogram bins across all features
+  bst_bin_t n_total_bins_{0};
+  // limits the number of nodes that can be in the cache for each tree
+  std::size_t n_cached_nodes_{0};
+  // whether the tree has grown beyond the cache limit
+  bool has_exceeded_{false};
+
+ public:
+  common::GHistRow operator[](std::size_t idx) {
+    auto offset = node_map_.at(idx);
+    return common::Span{data_.data(), data_.size()}.subspan(offset, n_total_bins_);
+  }
+  common::ConstGHistRow operator[](std::size_t idx) const {
+    auto offset = node_map_.at(idx);
+    return common::Span{data_.data(), data_.size()}.subspan(offset, n_total_bins_);
+  }
+  void Reset(bst_bin_t n_total_bins, std::size_t n_cached_nodes) {
+    n_total_bins_ = n_total_bins;
+    n_cached_nodes_ = n_cached_nodes;
+    this->Clear(false);
+  }
+  /**
+   * @brief Clear the cache and mark whether its size limit has been exceeded.
+   */
+  void Clear(bool exceeded) {
+    node_map_.clear();
+    current_size_ = 0;
+    has_exceeded_ = exceeded;
+  }
+
+  [[nodiscard]] bool CanHost(common::Span<bst_node_t const> nodes_to_build,
+                             common::Span<bst_node_t const> nodes_to_sub) const {
+    auto n_new_nodes = nodes_to_build.size() + nodes_to_sub.size();
+    return n_new_nodes + node_map_.size() <= n_cached_nodes_;
+  }
+
+  /**
+   * @brief Allocate histogram buffers for all nodes.
+   *
+   * The resulting histogram buffer is contiguous for all nodes, in the order of
+   * allocation.
+   */
+  void AllocateHistograms(common::Span<bst_node_t const> nodes_to_build,
+                          common::Span<bst_node_t const> nodes_to_sub) {
+    auto n_new_nodes = nodes_to_build.size() + nodes_to_sub.size();
+    auto alloc_size = n_new_nodes * n_total_bins_;
+    auto new_size = alloc_size + current_size_;
+    if (new_size > data_.size()) {
+      data_.resize(new_size);
+    }
+    for (auto nidx : nodes_to_build) {
+      node_map_[nidx] = current_size_;
+      current_size_ += n_total_bins_;
+    }
+    for (auto nidx : nodes_to_sub) {
+      node_map_[nidx] = current_size_;
+      current_size_ += n_total_bins_;
+    }
+    CHECK_EQ(current_size_, new_size);
+  }
+  void AllocateHistograms(std::vector<bst_node_t> const& nodes) {
+    this->AllocateHistograms(common::Span{nodes},
+                             common::Span<bst_node_t const>{});
+  }
+
+  [[nodiscard]] bool HasExceeded() const { return has_exceeded_; }
+  [[nodiscard]] bool HistogramExists(bst_node_t nidx) const {
+    return node_map_.find(nidx) != node_map_.cend();
+  }
+  [[nodiscard]] std::size_t Size() const { return current_size_; }
+};
+}  // namespace xgboost::tree
+#endif  // XGBOOST_TREE_HIST_HIST_CACHE_H_
diff --git a/src/tree/hist/histogram.cc b/src/tree/hist/histogram.cc
new file mode 100644
index 000000000..96abc039c
--- /dev/null
+++ b/src/tree/hist/histogram.cc
@@ -0,0 +1,63 @@
+/**
+ * Copyright 2023 by XGBoost Contributors
+ */
+#include "histogram.h"
+
+#include <cstddef>  // for size_t
+#include <numeric>  // for accumulate
+#include <utility>  // for swap
+#include <vector>   // for vector
+
+#include "../../common/transform_iterator.h"  // for MakeIndexTransformIter
+#include "expand_entry.h"                     // for MultiExpandEntry, CPUExpandEntry
+#include "xgboost/logging.h"                  // for CHECK_NE
+#include "xgboost/span.h"                     // for Span
+#include "xgboost/tree_model.h"               // for RegTree
+
+namespace xgboost::tree {
+void AssignNodes(RegTree const *p_tree, std::vector<MultiExpandEntry> const &valid_candidates,
+                 common::Span<bst_node_t> nodes_to_build, common::Span<bst_node_t> nodes_to_sub) {
+  CHECK_EQ(nodes_to_build.size(), valid_candidates.size());
+
+  std::size_t n_idx = 0;
+  for (auto const &c : valid_candidates) {
+    auto left_nidx = p_tree->LeftChild(c.nid);
+    auto right_nidx = p_tree->RightChild(c.nid);
+
+    auto build_nidx = left_nidx;
+    auto subtract_nidx = right_nidx;
+    auto lit =
+        common::MakeIndexTransformIter([&](auto i) { return c.split.left_sum[i].GetHess(); });
+    auto left_sum = std::accumulate(lit, lit + c.split.left_sum.size(), .0);
+    auto rit =
+        common::MakeIndexTransformIter([&](auto i) { return c.split.right_sum[i].GetHess(); });
+    auto right_sum = std::accumulate(rit, rit + c.split.right_sum.size(), .0);
+    auto fewer_right = right_sum < left_sum;
+    if (fewer_right) {
+      std::swap(build_nidx, subtract_nidx);
+    }
+    nodes_to_build[n_idx] = build_nidx;
+    nodes_to_sub[n_idx] = subtract_nidx;
+    ++n_idx;
+  }
+}
+
+void AssignNodes(RegTree const *p_tree, std::vector<CPUExpandEntry> const &candidates,
+                 common::Span<bst_node_t> nodes_to_build, common::Span<bst_node_t> nodes_to_sub) {
+  std::size_t n_idx = 0;
+  for (auto const &c : candidates) {
+    auto left_nidx = (*p_tree)[c.nid].LeftChild();
+    auto right_nidx = (*p_tree)[c.nid].RightChild();
+    auto fewer_right = c.split.right_sum.GetHess() < c.split.left_sum.GetHess();
+
+    auto build_nidx = left_nidx;
+    auto subtract_nidx = right_nidx;
+    if (fewer_right) {
+      std::swap(build_nidx, subtract_nidx);
+    }
+    nodes_to_build[n_idx] = build_nidx;
+    nodes_to_sub[n_idx] = subtract_nidx;
+    ++n_idx;
+  }
+}
+}  // namespace xgboost::tree
diff --git a/src/tree/hist/histogram.h b/src/tree/hist/histogram.h
index aef7f6df1..54c716887 100644
--- a/src/tree/hist/histogram.h
+++ b/src/tree/hist/histogram.h
@@ -4,80 +4,85 @@
 #ifndef XGBOOST_TREE_HIST_HISTOGRAM_H_
 #define XGBOOST_TREE_HIST_HISTOGRAM_H_
 
-#include <algorithm>
-#include <limits>
-#include <vector>
+#include <algorithm>   // for max
+#include <cstddef>     // for size_t
+#include <cstdint>     // for int32_t
+#include <functional>  // for function
+#include <utility>     // for move
+#include <vector>      // for vector
 
-#include "../../collective/communicator-inl.h"
-#include "../../common/hist_util.h"
-#include "../../data/gradient_index.h"
-#include "expand_entry.h"
-#include "xgboost/tree_model.h"  // for RegTree
+#include "../../collective/communicator-inl.h"  // for Allreduce
+#include "../../collective/communicator.h"      // for Operation
+#include "../../common/hist_util.h"             // for GHistRow, ParallelGHi...
+#include "../../common/row_set.h"               // for RowSetCollection
+#include "../../common/threading_utils.h"       // for ParallelFor2d, Range1d, BlockedSpace2d
+#include "../../data/gradient_index.h"          // for GHistIndexMatrix
+#include "expand_entry.h"                       // for MultiExpandEntry, CPUExpandEntry
+#include "hist_cache.h"                         // for BoundedHistCollection
+#include "param.h"                              // for HistMakerTrainParam
+#include "xgboost/base.h"                       // for bst_node_t, bst_target_t, bst_bin_t
+#include "xgboost/context.h"                    // for Context
+#include "xgboost/data.h"                       // for BatchIterator, BatchSet
+#include "xgboost/linalg.h"                     // for MatrixView, All, Vect...
+#include "xgboost/logging.h"                    // for CHECK_GE
+#include "xgboost/span.h"                       // for Span
+#include "xgboost/tree_model.h"                 // for RegTree
 
 namespace xgboost::tree {
-template <typename ExpandEntry>
+/**
+ * @brief Decide which node to use as the build node for multi-target trees.
+ */
+void AssignNodes(RegTree const *p_tree, std::vector<MultiExpandEntry> const &valid_candidates,
+                 common::Span<bst_node_t> nodes_to_build, common::Span<bst_node_t> nodes_to_sub);
+
+/**
+ * @brief Decide which node to use as the build node.
+ */
+void AssignNodes(RegTree const *p_tree, std::vector<CPUExpandEntry> const &candidates,
+                 common::Span<bst_node_t> nodes_to_build, common::Span<bst_node_t> nodes_to_sub);
+
 class HistogramBuilder {
   /*! \brief cumulative histogram of gradients. */
-  common::HistCollection hist_;
+  BoundedHistCollection hist_;
   common::ParallelGHistBuilder buffer_;
   BatchParam param_;
   int32_t n_threads_{-1};
-  size_t n_batches_{0};
   // Whether XGBoost is running in a distributed environment.
   bool is_distributed_{false};
   bool is_col_split_{false};
 
  public:
   /**
-   * \param total_bins Total number of bins across all features
-   * \param max_bin_per_feat Maximum number of bins per feature, same as the `max_bin`
-   * training parameter.
-   * \param n_threads Number of threads.
-   * \param is_distributed Mostly used for testing to allow injecting parameters instead
+   * @brief Reset the builder; this should be called before growing a new tree.
+   *
+   * @param total_bins Total number of bins across all features
+   * @param is_distributed Mostly used for testing to allow injecting parameters instead
    * of using the global rabit variable.
    */
-  void Reset(uint32_t total_bins, BatchParam p, int32_t n_threads, size_t n_batches,
-             bool is_distributed, bool is_col_split) {
-    CHECK_GE(n_threads, 1);
-    n_threads_ = n_threads;
-    n_batches_ = n_batches;
+  void Reset(Context const *ctx, bst_bin_t total_bins, BatchParam const &p, bool is_distributed,
+             bool is_col_split, HistMakerTrainParam const *param) {
+    n_threads_ = ctx->Threads();
     param_ = p;
-    hist_.Init(total_bins);
+    hist_.Reset(total_bins, param->internal_max_cached_hist_node);
    buffer_.Init(total_bins);
     is_distributed_ = is_distributed;
     is_col_split_ = is_col_split;
   }
 
   template <bool any_missing>
-  void BuildLocalHistograms(size_t page_idx, common::BlockedSpace2d space,
-                            GHistIndexMatrix const &gidx,
-                            std::vector<CPUExpandEntry> const &nodes_for_explicit_hist_build,
+  void BuildLocalHistograms(common::BlockedSpace2d const &space, GHistIndexMatrix const &gidx,
+                            std::vector<bst_node_t> const &nodes_to_build,
                             common::RowSetCollection const &row_set_collection,
                             common::Span<GradientPair const> gpair_h, bool force_read_by_column) {
-    const size_t n_nodes = nodes_for_explicit_hist_build.size();
-    CHECK_GT(n_nodes, 0);
-
-    std::vector<common::GHistRow> target_hists(n_nodes);
-    for (size_t i = 0; i < n_nodes; ++i) {
-      auto const nidx = nodes_for_explicit_hist_build[i].nid;
-      target_hists[i] = hist_[nidx];
-    }
-    if (page_idx == 0) {
-      // FIXME(jiamingy): Handle different size of space. Right now we use the maximum
-      // partition size for the buffer, which might not be efficient if partition sizes
-      // has significant variance.
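`BuildLocalHistograms` above fills one thread-local histogram per (thread, node) pair through `ParallelGHistBuilder`, and the reduction into the node's cached histogram happens later in `SyncHistogram` through `ReduceHist`. A simplified, sequential stand-in for that accumulate-then-reduce pattern (illustrative only, not the actual `ParallelGHistBuilder`):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    struct GradPair {
      double grad{0.0}, hess{0.0};
    };

    int main() {
      std::size_t const n_bins = 4, n_threads = 2;
      // One private histogram per thread, zero-initialized on first use, which
      // mirrors GetInitializedHist + std::fill_n in the diff.
      std::vector<std::vector<GradPair>> tloc(n_threads, std::vector<GradPair>(n_bins));

      // Pretend thread 0 accumulated two samples in bin 1 and thread 1 one sample.
      tloc[0][1] = {1.0, 2.0};
      tloc[1][1] = {0.5, 1.0};

      // The ReduceHist step: sum the thread-local histograms into the output.
      std::vector<GradPair> out(n_bins);
      for (auto const &h : tloc) {
        for (std::size_t b = 0; b < n_bins; ++b) {
          out[b].grad += h[b].grad;
          out[b].hess += h[b].hess;
        }
      }
      std::cout << out[1].grad << ", " << out[1].hess << "\n";  // prints 1.5, 3
      return 0;
    }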
- buffer_.Reset(this->n_threads_, n_nodes, space, target_hists); - } - // Parallel processing by nodes and data in each node common::ParallelFor2d(space, this->n_threads_, [&](size_t nid_in_set, common::Range1d r) { const auto tid = static_cast(omp_get_thread_num()); - const int32_t nid = nodes_for_explicit_hist_build[nid_in_set].nid; - auto elem = row_set_collection[nid]; + bst_node_t const nidx = nodes_to_build[nid_in_set]; + auto elem = row_set_collection[nidx]; auto start_of_row_set = std::min(r.begin(), elem.Size()); auto end_of_row_set = std::min(r.end(), elem.Size()); auto rid_set = common::RowSetCollection::Elem(elem.begin + start_of_row_set, - elem.begin + end_of_row_set, nid); + elem.begin + end_of_row_set, nidx); auto hist = buffer_.GetInitializedHist(tid, nid_in_set); if (rid_set.Size() != 0) { common::BuildHist(gpair_h, rid_set, gidx, hist, force_read_by_column); @@ -85,117 +90,143 @@ class HistogramBuilder { }); } - void AddHistRows(int *starting_index, - std::vector const &nodes_for_explicit_hist_build, - std::vector const &nodes_for_subtraction_trick) { - for (auto const &entry : nodes_for_explicit_hist_build) { - int nid = entry.nid; - this->hist_.AddHistRow(nid); - (*starting_index) = std::min(nid, (*starting_index)); + /** + * @brief Allocate histogram, rearrange the nodes if `rearrange` is true and the tree + * has reached the cache size limit. + */ + void AddHistRows(RegTree const *p_tree, std::vector *p_nodes_to_build, + std::vector *p_nodes_to_sub, bool rearrange) { + CHECK(p_nodes_to_build); + auto &nodes_to_build = *p_nodes_to_build; + CHECK(p_nodes_to_sub); + auto &nodes_to_sub = *p_nodes_to_sub; + + // We first check whether the cache size is already exceeded or about to be exceeded. + // If not, then we can allocate histograms without clearing the cache and without + // worrying about missing parent histogram. + // + // Otherwise, we need to rearrange the nodes before the allocation to make sure the + // resulting buffer is contiguous. This is to facilitate efficient allreduce. + + bool can_host = this->hist_.CanHost(nodes_to_build, nodes_to_sub); + // True if the tree is still within the size of cache limit. Allocate histogram as + // usual. + auto cache_is_valid = can_host && !this->hist_.HasExceeded(); + + if (!can_host) { + this->hist_.Clear(true); } - for (auto const &node : nodes_for_subtraction_trick) { - this->hist_.AddHistRow(node.nid); - } - this->hist_.AllocateAllData(); - } - - /** Main entry point of this class, build histogram for tree nodes. 
 */
-  void BuildHist(size_t page_id, common::BlockedSpace2d space, GHistIndexMatrix const &gidx,
-                 RegTree const *p_tree, common::RowSetCollection const &row_set_collection,
-                 std::vector<CPUExpandEntry> const &nodes_for_explicit_hist_build,
-                 std::vector<CPUExpandEntry> const &nodes_for_subtraction_trick,
-                 common::Span<GradientPair const> gpair, bool force_read_by_column = false) {
-    int starting_index = std::numeric_limits<int>::max();
-    if (page_id == 0) {
-      this->AddHistRows(&starting_index, nodes_for_explicit_hist_build,
-                        nodes_for_subtraction_trick);
-    }
-    if (gidx.IsDense()) {
-      this->BuildLocalHistograms<false>(page_id, space, gidx, nodes_for_explicit_hist_build,
-                                        row_set_collection, gpair, force_read_by_column);
-    } else {
-      this->BuildLocalHistograms<true>(page_id, space, gidx, nodes_for_explicit_hist_build,
-                                       row_set_collection, gpair, force_read_by_column);
-    }
-
-    CHECK_GE(n_batches_, 1);
-    if (page_id != n_batches_ - 1) {
+    if (!rearrange || cache_is_valid) {
+      // If we are not rearranging, we allocate the histograms as usual, assuming the
+      // nodes have been properly arranged by other builders.
+      this->hist_.AllocateHistograms(nodes_to_build, nodes_to_sub);
+      if (rearrange) {
+        CHECK(!this->hist_.HasExceeded());
+      }
       return;
     }
-    this->SyncHistogram(p_tree, nodes_for_explicit_hist_build,
-                        nodes_for_subtraction_trick, starting_index);
-  }
-  /** same as the other build hist but handles only single batch data (in-core) */
-  void BuildHist(size_t page_id, GHistIndexMatrix const &gidx, RegTree *p_tree,
-                 common::RowSetCollection const &row_set_collection,
-                 std::vector<CPUExpandEntry> const &nodes_for_explicit_hist_build,
-                 std::vector<CPUExpandEntry> const &nodes_for_subtraction_trick,
-                 common::Span<GradientPair const> gpair, bool force_read_by_column = false) {
-    const size_t n_nodes = nodes_for_explicit_hist_build.size();
-    // create space of size (# rows in each node)
-    common::BlockedSpace2d space(
-        n_nodes,
-        [&](size_t nidx_in_set) {
-          const int32_t nidx = nodes_for_explicit_hist_build[nidx_in_set].nid;
-          return row_set_collection[nidx].Size();
-        },
-        256);
-    this->BuildHist(page_id, space, gidx, p_tree, row_set_collection, nodes_for_explicit_hist_build,
-                    nodes_for_subtraction_trick, gpair, force_read_by_column);
-  }
-
-  void SyncHistogram(RegTree const *p_tree,
-                     std::vector<CPUExpandEntry> const &nodes_for_explicit_hist_build,
-                     std::vector<CPUExpandEntry> const &nodes_for_subtraction_trick,
-                     int starting_index) {
-    auto n_bins = buffer_.TotalBins();
-    common::BlockedSpace2d space(
-        nodes_for_explicit_hist_build.size(), [&](size_t) { return n_bins; }, 1024);
-    CHECK(hist_.IsContiguous());
-    common::ParallelFor2d(space, this->n_threads_, [&](size_t node, common::Range1d r) {
-      const auto &entry = nodes_for_explicit_hist_build[node];
-      auto this_hist = this->hist_[entry.nid];
-      // Merging histograms from each thread into one
-      this->buffer_.ReduceHist(node, r.begin(), r.end());
-    });
-
-    if (is_distributed_ && !is_col_split_) {
-      collective::Allreduce<collective::Operation::kSum>(
-          reinterpret_cast<double *>(this->hist_[starting_index].data()),
-          n_bins * nodes_for_explicit_hist_build.size() * 2);
+    // The cache is full; parent histograms might have been removed in previous
+    // iterations to save memory.
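The code that follows implements the rearrangement announced in the comment above: a node scheduled for the subtraction trick keeps it only if its parent histogram is still cached, otherwise it is demoted to a full build. A toy standalone version of that decision (the node numbering and the heap-style parent formula are made up for the example):

    #include <iostream>
    #include <set>
    #include <utility>
    #include <vector>

    int main() {
      std::set<int> cached = {1};              // parent histograms still in the cache
      std::vector<int> nodes_to_build = {3};   // children already chosen for a build
      std::vector<int> nodes_to_sub = {4, 6};  // children hoping to use subtraction
      auto parent = [](int nidx) { return (nidx - 1) / 2; };

      std::vector<int> can_subtract;
      for (int nidx : nodes_to_sub) {
        if (cached.count(parent(nidx)) != 0) {
          can_subtract.push_back(nidx);    // parent available: subtraction trick OK
        } else {
          nodes_to_build.push_back(nidx);  // parent evicted: full build required
        }
      }
      nodes_to_sub = std::move(can_subtract);

      for (int n : nodes_to_build) std::cout << "build " << n << "\n";  // 3, 6
      for (int n : nodes_to_sub) std::cout << "sub " << n << "\n";      // 4
      return 0;
    }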
+ std::vector can_subtract; + for (auto const &v : nodes_to_sub) { + if (this->hist_.HistogramExists(p_tree->Parent(v))) { + // We can still use the subtraction trick for this node + can_subtract.push_back(v); + } else { + // This node requires a full build + nodes_to_build.push_back(v); + } } - common::ParallelFor2d(space, this->n_threads_, [&](std::size_t nidx_in_set, common::Range1d r) { - const auto &entry = nodes_for_explicit_hist_build[nidx_in_set]; - auto this_hist = this->hist_[entry.nid]; - if (!p_tree->IsRoot(entry.nid)) { - auto const parent_id = p_tree->Parent(entry.nid); - auto const subtraction_node_id = nodes_for_subtraction_trick[nidx_in_set].nid; - auto parent_hist = this->hist_[parent_id]; - auto sibling_hist = this->hist_[subtraction_node_id]; - common::SubtractionHist(sibling_hist, parent_hist, this_hist, r.begin(), r.end()); + nodes_to_sub = std::move(can_subtract); + this->hist_.AllocateHistograms(nodes_to_build, nodes_to_sub); + } + + /** Main entry point of this class, build histogram for tree nodes. */ + void BuildHist(std::size_t page_idx, common::BlockedSpace2d const &space, + GHistIndexMatrix const &gidx, common::RowSetCollection const &row_set_collection, + std::vector const &nodes_to_build, + linalg::VectorView gpair, bool force_read_by_column = false) { + CHECK(gpair.Contiguous()); + + if (page_idx == 0) { + // Add the local histogram cache to the parallel buffer before processing the first page. + auto n_nodes = nodes_to_build.size(); + std::vector target_hists(n_nodes); + for (size_t i = 0; i < n_nodes; ++i) { + auto const nidx = nodes_to_build[i]; + target_hists[i] = hist_[nidx]; } + buffer_.Reset(this->n_threads_, n_nodes, space, target_hists); + } + + if (gidx.IsDense()) { + this->BuildLocalHistograms(space, gidx, nodes_to_build, row_set_collection, + gpair.Values(), force_read_by_column); + } else { + this->BuildLocalHistograms(space, gidx, nodes_to_build, row_set_collection, + gpair.Values(), force_read_by_column); + } + } + + void SyncHistogram(RegTree const *p_tree, std::vector const &nodes_to_build, + std::vector const &nodes_to_trick) { + auto n_total_bins = buffer_.TotalBins(); + common::BlockedSpace2d space( + nodes_to_build.size(), [&](std::size_t) { return n_total_bins; }, 1024); + common::ParallelFor2d(space, this->n_threads_, [&](size_t node, common::Range1d r) { + // Merging histograms from each thread. + this->buffer_.ReduceHist(node, r.begin(), r.end()); }); + if (is_distributed_ && !is_col_split_) { + // The cache is contiguous, we can perform allreduce for all nodes in one go. + CHECK(!nodes_to_build.empty()); + auto first_nidx = nodes_to_build.front(); + std::size_t n = n_total_bins * nodes_to_build.size() * 2; + collective::Allreduce( + reinterpret_cast(this->hist_[first_nidx].data()), n); + } + + common::BlockedSpace2d const &subspace = + nodes_to_trick.size() == nodes_to_build.size() + ? space + : common::BlockedSpace2d{nodes_to_trick.size(), + [&](std::size_t) { return n_total_bins; }, 1024}; + common::ParallelFor2d( + subspace, this->n_threads_, [&](std::size_t nidx_in_set, common::Range1d r) { + auto subtraction_nidx = nodes_to_trick[nidx_in_set]; + auto parent_id = p_tree->Parent(subtraction_nidx); + auto sibling_nidx = p_tree->IsLeftChild(subtraction_nidx) ? 
p_tree->RightChild(parent_id) + : p_tree->LeftChild(parent_id); + auto sibling_hist = this->hist_[sibling_nidx]; + auto parent_hist = this->hist_[parent_id]; + auto subtract_hist = this->hist_[subtraction_nidx]; + common::SubtractionHist(subtract_hist, parent_hist, sibling_hist, r.begin(), r.end()); + }); } public: /* Getters for tests. */ - common::HistCollection const &Histogram() { return hist_; } + [[nodiscard]] BoundedHistCollection const &Histogram() const { return hist_; } + [[nodiscard]] BoundedHistCollection &Histogram() { return hist_; } auto &Buffer() { return buffer_; } }; // Construct a work space for building histogram. Eventually we should move this // function into histogram builder once hist tree method supports external memory. -template +template common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners, - std::vector const &nodes_to_build) { - std::vector partition_size(nodes_to_build.size(), 0); + std::vector const &nodes_to_build) { + // FIXME(jiamingy): Handle different size of space. Right now we use the maximum + // partition size for the buffer, which might not be efficient if partition sizes + // has significant variance. + std::vector partition_size(nodes_to_build.size(), 0); for (auto const &partition : partitioners) { size_t k = 0; - for (auto node : nodes_to_build) { - auto n_rows_in_node = partition.Partitions()[node.nid].Size(); + for (auto nidx : nodes_to_build) { + auto n_rows_in_node = partition.Partitions()[nidx].Size(); partition_size[k] = std::max(partition_size[k], n_rows_in_node); k++; } @@ -204,5 +235,107 @@ common::BlockedSpace2d ConstructHistSpace(Partitioner const &partitioners, nodes_to_build.size(), [&](size_t nidx_in_set) { return partition_size[nidx_in_set]; }, 256}; return space; } + +/** + * @brief Histogram builder that can handle multiple targets. + */ +class MultiHistogramBuilder { + std::vector target_builders_; + Context const *ctx_; + + public: + /** + * @brief Build the histogram for root node. 
+   */
+  template <typename Partitioner, typename ExpandEntry>
+  void BuildRootHist(DMatrix *p_fmat, RegTree const *p_tree,
+                     std::vector<Partitioner> const &partitioners,
+                     linalg::MatrixView<GradientPair const> gpair, ExpandEntry const &best,
+                     BatchParam const &param, bool force_read_by_column = false) {
+    auto n_targets = p_tree->NumTargets();
+    CHECK_EQ(gpair.Shape(1), n_targets);
+    CHECK_EQ(p_fmat->Info().num_row_, gpair.Shape(0));
+    CHECK_EQ(target_builders_.size(), n_targets);
+    std::vector<bst_node_t> nodes{best.nid};
+    std::vector<bst_node_t> dummy_sub;
+
+    auto space = ConstructHistSpace(partitioners, nodes);
+    for (bst_target_t t{0}; t < n_targets; ++t) {
+      this->target_builders_[t].AddHistRows(p_tree, &nodes, &dummy_sub, false);
+    }
+    CHECK(dummy_sub.empty());
+
+    std::size_t page_idx{0};
+    for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, param)) {
+      for (bst_target_t t{0}; t < n_targets; ++t) {
+        auto t_gpair = gpair.Slice(linalg::All(), t);
+        this->target_builders_[t].BuildHist(page_idx, space, gidx,
+                                            partitioners[page_idx].Partitions(), nodes, t_gpair,
+                                            force_read_by_column);
+      }
+      ++page_idx;
+    }
+
+    for (bst_target_t t = 0; t < p_tree->NumTargets(); ++t) {
+      this->target_builders_[t].SyncHistogram(p_tree, nodes, dummy_sub);
+    }
+  }
+  /**
+   * @brief Build histograms for the left and right children of the valid candidates.
+   */
+  template <typename Partitioner, typename ExpandEntry>
+  void BuildHistLeftRight(DMatrix *p_fmat, RegTree const *p_tree,
+                          std::vector<Partitioner> const &partitioners,
+                          std::vector<ExpandEntry> const &valid_candidates,
+                          linalg::MatrixView<GradientPair const> gpair, BatchParam const &param,
+                          bool force_read_by_column = false) {
+    std::vector<bst_node_t> nodes_to_build(valid_candidates.size());
+    std::vector<bst_node_t> nodes_to_sub(valid_candidates.size());
+    AssignNodes(p_tree, valid_candidates, nodes_to_build, nodes_to_sub);
+
+    // Use the first builder to determine the number of valid nodes.
+    target_builders_.front().AddHistRows(p_tree, &nodes_to_build, &nodes_to_sub, true);
+    CHECK_GE(nodes_to_build.size(), nodes_to_sub.size());
+    CHECK_EQ(nodes_to_sub.size() + nodes_to_build.size(), valid_candidates.size() * 2);
+
+    // Allocate storage for the rest of the builders.
+    for (bst_target_t t = 1; t < target_builders_.size(); ++t) {
+      target_builders_[t].AddHistRows(p_tree, &nodes_to_build, &nodes_to_sub, false);
+    }
+
+    auto space = ConstructHistSpace(partitioners, nodes_to_build);
+    std::size_t page_idx{0};
+    for (auto const &page : p_fmat->GetBatches<GHistIndexMatrix>(ctx_, param)) {
+      CHECK_EQ(gpair.Shape(1), p_tree->NumTargets());
+      for (bst_target_t t = 0; t < p_tree->NumTargets(); ++t) {
+        auto t_gpair = gpair.Slice(linalg::All(), t);
+        CHECK_EQ(t_gpair.Shape(0), p_fmat->Info().num_row_);
+        this->target_builders_[t].BuildHist(page_idx, space, page,
+                                            partitioners[page_idx].Partitions(), nodes_to_build,
+                                            t_gpair, force_read_by_column);
+      }
+      page_idx++;
+    }
+
+    for (bst_target_t t = 0; t < p_tree->NumTargets(); ++t) {
+      this->target_builders_[t].SyncHistogram(p_tree, nodes_to_build, nodes_to_sub);
+    }
+  }
+
+  [[nodiscard]] auto const &Histogram(bst_target_t t) const {
+    return target_builders_[t].Histogram();
+  }
+  [[nodiscard]] auto &Histogram(bst_target_t t) { return target_builders_[t].Histogram(); }
+
+  void Reset(Context const *ctx, bst_bin_t total_bins, bst_target_t n_targets, BatchParam const &p,
+             bool is_distributed, bool is_col_split, HistMakerTrainParam const *param) {
+    ctx_ = ctx;
+    target_builders_.resize(n_targets);
+    CHECK_GE(n_targets, 1);
+    for (auto &v : target_builders_) {
+      v.Reset(ctx, total_bins, p, is_distributed, is_col_split, param);
+    }
+  }
+};
 }  // namespace xgboost::tree
 #endif  // XGBOOST_TREE_HIST_HISTOGRAM_H_
diff --git a/src/tree/hist/param.h
b/src/tree/hist/param.h index 3dfbf68e1..0f2f4ac00 100644 --- a/src/tree/hist/param.h +++ b/src/tree/hist/param.h @@ -2,12 +2,19 @@ * Copyright 2021-2023, XGBoost Contributors */ #pragma once -#include "xgboost/parameter.h" + +#include // for size_t + +#include "xgboost/parameter.h" // for XGBoostParameter #include "xgboost/tree_model.h" // for RegTree namespace xgboost::tree { struct HistMakerTrainParam : public XGBoostParameter { - bool debug_synchronize; + constexpr static std::size_t DefaultNodes() { return static_cast(1) << 16; } + + bool debug_synchronize{false}; + std::size_t internal_max_cached_hist_node{DefaultNodes()}; + void CheckTreesSynchronized(RegTree const* local_tree) const; // declare parameters @@ -15,6 +22,10 @@ struct HistMakerTrainParam : public XGBoostParameter { DMLC_DECLARE_FIELD(debug_synchronize) .set_default(false) .describe("Check if all distributed tree are identical after tree construction."); + DMLC_DECLARE_FIELD(internal_max_cached_hist_node) + .set_default(DefaultNodes()) + .set_lower_bound(1) + .describe("Maximum number of nodes in CPU histogram cache. Only for internal usage."); } }; } // namespace xgboost::tree diff --git a/src/tree/param.h b/src/tree/param.h index e182fe539..5e2a36dfe 100644 --- a/src/tree/param.h +++ b/src/tree/param.h @@ -526,7 +526,7 @@ struct SplitEntryContainer { * \return whether the proposed split is better and can replace current split */ template - bool Update(bst_float new_loss_chg, unsigned split_index, bst_float new_split_value, + bool Update(bst_float new_loss_chg, bst_feature_t split_index, float new_split_value, bool default_left, bool is_cat, GradientSumT const &left_sum, GradientSumT const &right_sum) { if (this->NeedReplace(new_loss_chg, split_index)) { diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc index 9f496d052..2110cd6e6 100644 --- a/src/tree/updater_approx.cc +++ b/src/tree/updater_approx.cc @@ -3,27 +3,39 @@ * * \brief Implementation for the approx tree method. 
*/ -#include -#include -#include +#include // for max, transform, fill_n +#include // for size_t +#include // for map +#include // for allocator, unique_ptr, make_shared, make_unique +#include // for move +#include // for vector -#include "../collective/aggregator.h" -#include "../common/random.h" -#include "../data/gradient_index.h" -#include "common_row_partitioner.h" -#include "driver.h" -#include "hist/evaluate_splits.h" -#include "hist/histogram.h" -#include "hist/param.h" -#include "hist/sampler.h" // for SampleGradient -#include "param.h" // for HistMakerTrainParam -#include "xgboost/base.h" -#include "xgboost/data.h" -#include "xgboost/json.h" -#include "xgboost/linalg.h" -#include "xgboost/task.h" // for ObjInfo -#include "xgboost/tree_model.h" -#include "xgboost/tree_updater.h" // for TreeUpdater +#include "../collective/aggregator.h" // for GlobalSum +#include "../collective/communicator-inl.h" // for IsDistributed +#include "../common/hist_util.h" // for HistogramCuts +#include "../common/random.h" // for ColumnSampler +#include "../common/timer.h" // for Monitor +#include "../data/gradient_index.h" // for GHistIndexMatrix +#include "common_row_partitioner.h" // for CommonRowPartitioner +#include "dmlc/registry.h" // for DMLC_REGISTRY_FILE_TAG +#include "driver.h" // for Driver +#include "hist/evaluate_splits.h" // for HistEvaluator, UpdatePredictionCacheImpl +#include "hist/expand_entry.h" // for CPUExpandEntry +#include "hist/histogram.h" // for MultiHistogramBuilder +#include "hist/param.h" // for HistMakerTrainParam +#include "hist/sampler.h" // for SampleGradient +#include "param.h" // for GradStats, TrainParam +#include "xgboost/base.h" // for Args, GradientPair, bst_node_t, bst_bin_t +#include "xgboost/context.h" // for Context +#include "xgboost/data.h" // for DMatrix, BatchSet, BatchIterator, MetaInfo +#include "xgboost/host_device_vector.h" // for HostDeviceVector +#include "xgboost/json.h" // for Object, Json, FromJson, ToJson, get +#include "xgboost/linalg.h" // for Matrix, MakeTensorView, Empty, MatrixView +#include "xgboost/logging.h" // for LogCheck_EQ, CHECK_EQ, CHECK +#include "xgboost/span.h" // for Span +#include "xgboost/task.h" // for ObjInfo +#include "xgboost/tree_model.h" // for RegTree, RTreeNodeStat +#include "xgboost/tree_updater.h" // for TreeUpdater, TreeUpdaterReg, XGBOOST_REGISTE... 
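With these includes in place, the approx builder below delegates histogram construction to `MultiHistogramBuilder`, which relies on the subtraction trick chosen by `AssignNodes`: build the child with the smaller Hessian sum directly, then derive the sibling from the parent bin by bin. A toy numeric sketch of the arithmetic (made-up values):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main() {
      // Per-bin gradient sums; the parent histogram was built in an earlier
      // iteration, the cheaper child was built directly from the data.
      std::vector<double> parent = {10.0, 6.0, 8.0};
      std::vector<double> built = {4.0, 1.0, 5.0};

      // The SubtractionHist step: sibling = parent - built, bin by bin.
      std::vector<double> sibling(parent.size());
      for (std::size_t b = 0; b < parent.size(); ++b) {
        sibling[b] = parent[b] - built[b];
      }
      for (double v : sibling) std::cout << v << " ";  // prints 6 5 3
      std::cout << "\n";
      return 0;
    }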
namespace xgboost::tree { @@ -46,7 +58,7 @@ class GloablApproxBuilder { HistMakerTrainParam const *hist_param_{nullptr}; std::shared_ptr col_sampler_; HistEvaluator evaluator_; - HistogramBuilder histogram_builder_; + MultiHistogramBuilder histogram_builder_; Context const *ctx_; ObjInfo const *const task_; @@ -59,7 +71,7 @@ class GloablApproxBuilder { common::HistogramCuts feature_values_; public: - void InitData(DMatrix *p_fmat, common::Span hess) { + void InitData(DMatrix *p_fmat, RegTree const *p_tree, common::Span hess) { monitor_->Start(__func__); n_batches_ = 0; @@ -79,8 +91,9 @@ class GloablApproxBuilder { n_batches_++; } - histogram_builder_.Reset(n_total_bins, BatchSpec(*param_, hess), ctx_->Threads(), n_batches_, - collective::IsDistributed(), p_fmat->Info().IsColumnSplit()); + histogram_builder_.Reset(ctx_, n_total_bins, p_tree->NumTargets(), BatchSpec(*param_, hess), + collective::IsDistributed(), p_fmat->Info().IsColumnSplit(), + hist_param_); monitor_->Stop(__func__); } @@ -96,20 +109,16 @@ class GloablApproxBuilder { } collective::GlobalSum(p_fmat->Info(), reinterpret_cast(&root_sum), 2); std::vector nodes{best}; - size_t i = 0; - auto space = ConstructHistSpace(partitioner_, nodes); - for (auto const &page : p_fmat->GetBatches(ctx_, BatchSpec(*param_, hess))) { - histogram_builder_.BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(), nodes, - {}, gpair); - i++; - } + this->histogram_builder_.BuildRootHist(p_fmat, p_tree, partitioner_, + linalg::MakeTensorView(ctx_, gpair, gpair.size(), 1), + best, BatchSpec(*param_, hess)); auto weight = evaluator_.InitRoot(root_sum); p_tree->Stat(RegTree::kRoot).sum_hess = root_sum.GetHess(); p_tree->Stat(RegTree::kRoot).base_weight = weight; (*p_tree)[RegTree::kRoot].SetLeaf(param_->learning_rate * weight); - auto const &histograms = histogram_builder_.Histogram(); + auto const &histograms = histogram_builder_.Histogram(0); auto ft = p_fmat->Info().feature_types.ConstHostSpan(); evaluator_.EvaluateSplits(histograms, feature_values_, ft, *p_tree, &nodes); monitor_->Stop(__func__); @@ -130,30 +139,9 @@ class GloablApproxBuilder { std::vector const &valid_candidates, std::vector const &gpair, common::Span hess) { monitor_->Start(__func__); - std::vector nodes_to_build; - std::vector nodes_to_sub; - - for (auto const &c : valid_candidates) { - auto left_nidx = (*p_tree)[c.nid].LeftChild(); - auto right_nidx = (*p_tree)[c.nid].RightChild(); - auto fewer_right = c.split.right_sum.GetHess() < c.split.left_sum.GetHess(); - - auto build_nidx = left_nidx; - auto subtract_nidx = right_nidx; - if (fewer_right) { - std::swap(build_nidx, subtract_nidx); - } - nodes_to_build.push_back(CPUExpandEntry{build_nidx, p_tree->GetDepth(build_nidx), {}}); - nodes_to_sub.push_back(CPUExpandEntry{subtract_nidx, p_tree->GetDepth(subtract_nidx), {}}); - } - - size_t i = 0; - auto space = ConstructHistSpace(partitioner_, nodes_to_build); - for (auto const &page : p_fmat->GetBatches(ctx_, BatchSpec(*param_, hess))) { - histogram_builder_.BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(), - nodes_to_build, nodes_to_sub, gpair); - i++; - } + this->histogram_builder_.BuildHistLeftRight( + p_fmat, p_tree, partitioner_, valid_candidates, + linalg::MakeTensorView(ctx_, gpair, gpair.size(), 1), BatchSpec(*param_, hess)); monitor_->Stop(__func__); } @@ -185,7 +173,7 @@ class GloablApproxBuilder { void UpdateTree(DMatrix *p_fmat, std::vector const &gpair, common::Span hess, RegTree *p_tree, HostDeviceVector *p_out_position) { p_last_tree_ = 
p_tree; - this->InitData(p_fmat, hess); + this->InitData(p_fmat, p_tree, hess); Driver driver(*param_); auto &tree = *p_tree; @@ -235,7 +223,7 @@ class GloablApproxBuilder { best_splits.push_back(l_best); best_splits.push_back(r_best); } - auto const &histograms = histogram_builder_.Histogram(); + auto const &histograms = histogram_builder_.Histogram(0); auto ft = p_fmat->Info().feature_types.ConstHostSpan(); monitor_->Start("EvaluateSplits"); evaluator_.EvaluateSplits(histograms, feature_values_, ft, *p_tree, &best_splits); diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 63aaf27f6..883c18f36 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -7,35 +7,37 @@ #include // for max, copy, transform #include // for size_t #include // for uint32_t, int32_t -#include // for unique_ptr, allocator, make_unique, shared_ptr -#include // for accumulate -#include // for basic_ostream, char_traits, operator<< -#include // for move, swap +#include // for exception +#include // for allocator, unique_ptr, make_unique, shared_ptr +#include // for operator<<, basic_ostream, char_traits +#include // for move #include // for vector #include "../collective/aggregator.h" // for GlobalSum -#include "../collective/communicator-inl.h" // for Allreduce, IsDistributed -#include "../common/hist_util.h" // for HistogramCuts, HistCollection +#include "../collective/communicator-inl.h" // for IsDistributed +#include "../common/hist_util.h" // for HistogramCuts, GHistRow #include "../common/linalg_op.h" // for begin, cbegin, cend #include "../common/random.h" // for ColumnSampler #include "../common/threading_utils.h" // for ParallelFor #include "../common/timer.h" // for Monitor -#include "../common/transform_iterator.h" // for IndexTransformIter, MakeIndexTransformIter +#include "../common/transform_iterator.h" // for IndexTransformIter #include "../data/gradient_index.h" // for GHistIndexMatrix #include "common_row_partitioner.h" // for CommonRowPartitioner #include "dmlc/registry.h" // for DMLC_REGISTRY_FILE_TAG #include "driver.h" // for Driver #include "hist/evaluate_splits.h" // for HistEvaluator, HistMultiEvaluator, UpdatePre... #include "hist/expand_entry.h" // for MultiExpandEntry, CPUExpandEntry -#include "hist/histogram.h" // for HistogramBuilder, ConstructHistSpace +#include "hist/hist_cache.h" // for BoundedHistCollection +#include "hist/histogram.h" // for MultiHistogramBuilder #include "hist/param.h" // for HistMakerTrainParam #include "hist/sampler.h" // for SampleGradient -#include "param.h" // for TrainParam, SplitEntryContainer, GradStats -#include "xgboost/base.h" // for GradientPairInternal, GradientPair, bst_targ... +#include "param.h" // for TrainParam, GradStats +#include "xgboost/base.h" // for Args, GradientPairPrecise, GradientPair, Gra... 
#include "xgboost/context.h" // for Context -#include "xgboost/data.h" // for BatchIterator, BatchSet, DMatrix, MetaInfo +#include "xgboost/data.h" // for BatchSet, DMatrix, BatchIterator, MetaInfo #include "xgboost/host_device_vector.h" // for HostDeviceVector -#include "xgboost/linalg.h" // for All, MatrixView, TensorView, Matrix, Empty +#include "xgboost/json.h" // for Object, Json, FromJson, ToJson, get +#include "xgboost/linalg.h" // for MatrixView, TensorView, All, Matrix, Empty #include "xgboost/logging.h" // for LogCheck_EQ, CHECK_EQ, CHECK, LogCheck_GE #include "xgboost/span.h" // for Span, operator!=, SpanIterator #include "xgboost/string_view.h" // for operator<< @@ -120,7 +122,7 @@ class MultiTargetHistBuilder { std::shared_ptr col_sampler_; std::unique_ptr evaluator_; // Histogram builder for each target. - std::vector> histogram_builder_; + std::unique_ptr histogram_builder_; Context const *ctx_{nullptr}; // Partitioner for each data batch. std::vector partitioner_; @@ -150,7 +152,6 @@ class MultiTargetHistBuilder { monitor_->Start(__func__); p_last_fmat_ = p_fmat; - std::size_t page_id = 0; bst_bin_t n_total_bins = 0; partitioner_.clear(); for (auto const &page : p_fmat->GetBatches(ctx_, HistBatch(param_))) { @@ -160,16 +161,13 @@ class MultiTargetHistBuilder { CHECK_EQ(n_total_bins, page.cut.TotalBins()); } partitioner_.emplace_back(ctx_, page.Size(), page.base_rowid, p_fmat->Info().IsColumnSplit()); - page_id++; } bst_target_t n_targets = p_tree->NumTargets(); - histogram_builder_.clear(); - for (std::size_t i = 0; i < n_targets; ++i) { - histogram_builder_.emplace_back(); - histogram_builder_.back().Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id, - collective::IsDistributed(), p_fmat->Info().IsColumnSplit()); - } + histogram_builder_ = std::make_unique(); + histogram_builder_->Reset(ctx_, n_total_bins, n_targets, HistBatch(param_), + collective::IsDistributed(), p_fmat->Info().IsColumnSplit(), + hist_param_); evaluator_ = std::make_unique(ctx_, p_fmat->Info(), param_, col_sampler_); p_last_tree_ = p_tree; @@ -204,17 +202,7 @@ class MultiTargetHistBuilder { collective::GlobalSum(p_fmat->Info(), reinterpret_cast(root_sum.Values().data()), root_sum.Size() * 2); - std::vector nodes{best}; - std::size_t i = 0; - auto space = ConstructHistSpace(partitioner_, nodes); - for (auto const &page : p_fmat->GetBatches(ctx_, HistBatch(param_))) { - for (bst_target_t t{0}; t < n_targets; ++t) { - auto t_gpair = gpair.Slice(linalg::All(), t); - histogram_builder_[t].BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(), - nodes, {}, t_gpair.Values()); - } - i++; - } + histogram_builder_->BuildRootHist(p_fmat, p_tree, partitioner_, gpair, best, HistBatch(param_)); auto weight = evaluator_->InitRoot(root_sum); auto weight_t = weight.HostView(); @@ -222,9 +210,10 @@ class MultiTargetHistBuilder { [&](float w) { return w * param_->learning_rate; }); p_tree->SetLeaf(RegTree::kRoot, weight_t); - std::vector hists; + std::vector hists; + std::vector nodes{{RegTree::kRoot, 0}}; for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) { - hists.push_back(&histogram_builder_[t].Histogram()); + hists.push_back(&(*histogram_builder_).Histogram(t)); } for (auto const &gmat : p_fmat->GetBatches(ctx_, HistBatch(param_))) { evaluator_->EvaluateSplits(*p_tree, hists, gmat.cut, &nodes); @@ -239,50 +228,17 @@ class MultiTargetHistBuilder { std::vector const &valid_candidates, linalg::MatrixView gpair) { monitor_->Start(__func__); - std::vector nodes_to_build; - std::vector 
nodes_to_sub; - - for (auto const &c : valid_candidates) { - auto left_nidx = p_tree->LeftChild(c.nid); - auto right_nidx = p_tree->RightChild(c.nid); - - auto build_nidx = left_nidx; - auto subtract_nidx = right_nidx; - auto lit = - common::MakeIndexTransformIter([&](auto i) { return c.split.left_sum[i].GetHess(); }); - auto left_sum = std::accumulate(lit, lit + c.split.left_sum.size(), .0); - auto rit = - common::MakeIndexTransformIter([&](auto i) { return c.split.right_sum[i].GetHess(); }); - auto right_sum = std::accumulate(rit, rit + c.split.right_sum.size(), .0); - auto fewer_right = right_sum < left_sum; - if (fewer_right) { - std::swap(build_nidx, subtract_nidx); - } - nodes_to_build.emplace_back(build_nidx, p_tree->GetDepth(build_nidx)); - nodes_to_sub.emplace_back(subtract_nidx, p_tree->GetDepth(subtract_nidx)); - } - - std::size_t i = 0; - auto space = ConstructHistSpace(partitioner_, nodes_to_build); - for (auto const &page : p_fmat->GetBatches(ctx_, HistBatch(param_))) { - for (std::size_t t = 0; t < p_tree->NumTargets(); ++t) { - auto t_gpair = gpair.Slice(linalg::All(), t); - // Make sure the gradient matrix is f-order. - CHECK(t_gpair.Contiguous()); - histogram_builder_[t].BuildHist(i, space, page, p_tree, partitioner_.at(i).Partitions(), - nodes_to_build, nodes_to_sub, t_gpair.Values()); - } - i++; - } + histogram_builder_->BuildHistLeftRight(p_fmat, p_tree, partitioner_, valid_candidates, gpair, + HistBatch(param_)); monitor_->Stop(__func__); } void EvaluateSplits(DMatrix *p_fmat, RegTree const *p_tree, std::vector *best_splits) { monitor_->Start(__func__); - std::vector hists; + std::vector hists; for (bst_target_t t{0}; t < p_tree->NumTargets(); ++t) { - hists.push_back(&histogram_builder_[t].Histogram()); + hists.push_back(&(*histogram_builder_).Histogram(t)); } for (auto const &gmat : p_fmat->GetBatches(ctx_, HistBatch(param_))) { evaluator_->EvaluateSplits(*p_tree, hists, gmat.cut, best_splits); @@ -349,7 +305,7 @@ class HistUpdater { const RegTree *p_last_tree_{nullptr}; DMatrix const *const p_last_fmat_{nullptr}; - std::unique_ptr> histogram_builder_; + std::unique_ptr histogram_builder_; ObjInfo const *task_{nullptr}; // Context for number of threads Context const *ctx_{nullptr}; @@ -364,7 +320,7 @@ class HistUpdater { col_sampler_{std::move(column_sampler)}, evaluator_{std::make_unique(ctx, param, fmat->Info(), col_sampler_)}, p_last_fmat_(fmat), - histogram_builder_{new HistogramBuilder}, + histogram_builder_{new MultiHistogramBuilder}, task_{task}, ctx_{ctx} { monitor_->Init(__func__); @@ -387,7 +343,6 @@ class HistUpdater { // initialize temp data structure void InitData(DMatrix *fmat, RegTree const *p_tree) { monitor_->Start(__func__); - std::size_t page_id{0}; bst_bin_t n_total_bins{0}; partitioner_.clear(); for (auto const &page : fmat->GetBatches(ctx_, HistBatch(param_))) { @@ -398,10 +353,9 @@ class HistUpdater { } partitioner_.emplace_back(this->ctx_, page.Size(), page.base_rowid, fmat->Info().IsColumnSplit()); - ++page_id; } - histogram_builder_->Reset(n_total_bins, HistBatch(param_), ctx_->Threads(), page_id, - collective::IsDistributed(), fmat->Info().IsColumnSplit()); + histogram_builder_->Reset(ctx_, n_total_bins, 1, HistBatch(param_), collective::IsDistributed(), + fmat->Info().IsColumnSplit(), hist_param_); evaluator_ = std::make_unique(ctx_, this->param_, fmat->Info(), col_sampler_); p_last_tree_ = p_tree; monitor_->Stop(__func__); @@ -410,7 +364,7 @@ class HistUpdater { void EvaluateSplits(DMatrix *p_fmat, RegTree const *p_tree, std::vector 
*best_splits) { monitor_->Start(__func__); - auto const &histograms = histogram_builder_->Histogram(); + auto const &histograms = histogram_builder_->Histogram(0); auto ft = p_fmat->Info().feature_types.ConstHostSpan(); for (auto const &gmat : p_fmat->GetBatches(ctx_, HistBatch(param_))) { evaluator_->EvaluateSplits(histograms, gmat.cut, ft, *p_tree, best_splits); @@ -428,16 +382,8 @@ class HistUpdater { monitor_->Start(__func__); CPUExpandEntry node(RegTree::kRoot, p_tree->GetDepth(0)); - std::size_t page_id = 0; - auto space = ConstructHistSpace(partitioner_, {node}); - for (auto const &gidx : p_fmat->GetBatches(ctx_, HistBatch(param_))) { - std::vector nodes_to_build{node}; - std::vector nodes_to_sub; - this->histogram_builder_->BuildHist(page_id, space, gidx, p_tree, - partitioner_.at(page_id).Partitions(), nodes_to_build, - nodes_to_sub, gpair.Slice(linalg::All(), 0).Values()); - ++page_id; - } + this->histogram_builder_->BuildRootHist(p_fmat, p_tree, partitioner_, gpair, node, + HistBatch(param_)); { GradientPairPrecise grad_stat; @@ -451,7 +397,7 @@ class HistUpdater { CHECK_GE(row_ptr.size(), 2); std::uint32_t const ibegin = row_ptr[0]; std::uint32_t const iend = row_ptr[1]; - auto hist = this->histogram_builder_->Histogram()[RegTree::kRoot]; + auto hist = this->histogram_builder_->Histogram(0)[RegTree::kRoot]; auto begin = hist.data(); for (std::uint32_t i = ibegin; i < iend; ++i) { GradientPairPrecise const &et = begin[i]; @@ -474,7 +420,7 @@ class HistUpdater { monitor_->Start("EvaluateSplits"); auto ft = p_fmat->Info().feature_types.ConstHostSpan(); for (auto const &gmat : p_fmat->GetBatches(ctx_, HistBatch(param_))) { - evaluator_->EvaluateSplits(histogram_builder_->Histogram(), gmat.cut, ft, *p_tree, + evaluator_->EvaluateSplits(histogram_builder_->Histogram(0), gmat.cut, ft, *p_tree, &entries); break; } @@ -490,33 +436,8 @@ class HistUpdater { std::vector const &valid_candidates, linalg::MatrixView gpair) { monitor_->Start(__func__); - std::vector nodes_to_build(valid_candidates.size()); - std::vector nodes_to_sub(valid_candidates.size()); - - std::size_t n_idx = 0; - for (auto const &c : valid_candidates) { - auto left_nidx = (*p_tree)[c.nid].LeftChild(); - auto right_nidx = (*p_tree)[c.nid].RightChild(); - auto fewer_right = c.split.right_sum.GetHess() < c.split.left_sum.GetHess(); - - auto build_nidx = left_nidx; - auto subtract_nidx = right_nidx; - if (fewer_right) { - std::swap(build_nidx, subtract_nidx); - } - nodes_to_build[n_idx] = CPUExpandEntry{build_nidx, p_tree->GetDepth(build_nidx), {}}; - nodes_to_sub[n_idx] = CPUExpandEntry{subtract_nidx, p_tree->GetDepth(subtract_nidx), {}}; - n_idx++; - } - - std::size_t page_id{0}; - auto space = ConstructHistSpace(partitioner_, nodes_to_build); - for (auto const &gidx : p_fmat->GetBatches(ctx_, HistBatch(param_))) { - histogram_builder_->BuildHist(page_id, space, gidx, p_tree, - partitioner_.at(page_id).Partitions(), nodes_to_build, - nodes_to_sub, gpair.Values()); - ++page_id; - } + this->histogram_builder_->BuildHistLeftRight(p_fmat, p_tree, partitioner_, valid_candidates, + gpair, HistBatch(param_)); monitor_->Stop(__func__); } diff --git a/tests/cpp/common/test_hist_util.cc b/tests/cpp/common/test_hist_util.cc index f35a35bb4..70ebecd3d 100644 --- a/tests/cpp/common/test_hist_util.cc +++ b/tests/cpp/common/test_hist_util.cc @@ -27,8 +27,8 @@ void ParallelGHistBuilderReset() { for(size_t inode = 0; inode < kNodesExtended; inode++) { collection.AddHistRow(inode); + collection.AllocateData(inode); } - 
collection.AllocateAllData(); ParallelGHistBuilder hist_builder; hist_builder.Init(kBins); std::vector target_hist(kNodes); @@ -83,8 +83,8 @@ void ParallelGHistBuilderReduceHist(){ for(size_t inode = 0; inode < kNodes; inode++) { collection.AddHistRow(inode); + collection.AllocateData(inode); } - collection.AllocateAllData(); ParallelGHistBuilder hist_builder; hist_builder.Init(kBins); std::vector target_hist(kNodes); @@ -129,7 +129,7 @@ TEST(CutsBuilder, SearchGroupInd) { auto p_mat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(); - std::vector group(kNumGroups); + std::vector group(kNumGroups); group[0] = 2; group[1] = 3; group[2] = 7; diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc index 3615f7587..48fd2d8e9 100644 --- a/tests/cpp/test_learner.cc +++ b/tests/cpp/test_learner.cc @@ -92,7 +92,7 @@ TEST(Learner, CheckGroup) { std::shared_ptr p_mat{RandomDataGenerator{kNumRows, kNumCols, 0.0f}.GenerateDMatrix()}; std::vector weight(kNumGroups, 1); - std::vector group(kNumGroups); + std::vector group(kNumGroups); group[0] = 2; group[1] = 3; group[2] = 7; diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc index 7bde3aca2..1685a3c80 100644 --- a/tests/cpp/tree/hist/test_evaluate_splits.cc +++ b/tests/cpp/tree/hist/test_evaluate_splits.cc @@ -4,13 +4,13 @@ #include "../test_evaluate_splits.h" #include -#include // for GradientPairPrecise, Args, Gradie... -#include // for Context -#include // for FeatureType, DMatrix, MetaInfo -#include // for CHECK_EQ -#include // for RegTree, RTreeNodeStat +#include // for GradientPairPrecise, Args, Gradie... +#include // for Context +#include // for FeatureType, DMatrix, MetaInfo +#include // for CHECK_EQ +#include // for RegTree, RTreeNodeStat -#include // for make_shared, shared_ptr, addressof +#include // for make_shared, shared_ptr, addressof #include "../../../../src/common/hist_util.h" // for HistCollection, HistogramCuts #include "../../../../src/common/random.h" // for ColumnSampler @@ -18,6 +18,8 @@ #include "../../../../src/data/gradient_index.h" // for GHistIndexMatrix #include "../../../../src/tree/hist/evaluate_splits.h" // for HistEvaluator #include "../../../../src/tree/hist/expand_entry.h" // for CPUExpandEntry +#include "../../../../src/tree/hist/hist_cache.h" // for BoundedHistCollection +#include "../../../../src/tree/hist/param.h" // for HistMakerTrainParam #include "../../../../src/tree/param.h" // for GradStats, TrainParam #include "../../helpers.h" // for RandomDataGenerator, AllThreadsFo... 
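The updated tests below replace the old `Init`/`AddHistRow`/`AllocateAllData` sequence with the new `Reset`/`AllocateHistograms` pair. A sketch of an additional unit test one could write against the cache limit itself; it is not part of this patch and only uses the `BoundedHistCollection` methods introduced above:

    #include <gtest/gtest.h>

    #include <vector>

    #include "../../../../src/tree/hist/hist_cache.h"
    #include "xgboost/base.h"

    namespace xgboost::tree {
    TEST(BoundedHistCollection, Limit) {
      BoundedHistCollection cache;
      cache.Reset(/*n_total_bins=*/8, /*n_cached_nodes=*/2);

      std::vector<bst_node_t> roots{0};
      cache.AllocateHistograms(roots);
      ASSERT_TRUE(cache.HistogramExists(0));

      // Two more nodes would exceed the two-node limit.
      std::vector<bst_node_t> children{1, 2};
      ASSERT_FALSE(cache.CanHost(common::Span<bst_node_t const>{children},
                                 common::Span<bst_node_t const>{}));

      // The caller clears the cache and records that the limit was hit.
      cache.Clear(/*exceeded=*/true);
      ASSERT_TRUE(cache.HasExceeded());
    }
    }  // namespace xgboost::tree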
@@ -34,7 +36,7 @@ void TestEvaluateSplits(bool force_read_by_column) { auto dmat = RandomDataGenerator(kRows, kCols, 0).Seed(3).GenerateDMatrix(); auto evaluator = HistEvaluator{&ctx, ¶m, dmat->Info(), sampler}; - common::HistCollection hist; + BoundedHistCollection hist; std::vector row_gpairs = { {1.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {2.27f, 0.28f}, {0.27f, 0.29f}, {0.37f, 0.39f}, {-0.47f, 0.49f}, {0.57f, 0.59f}}; @@ -48,9 +50,9 @@ void TestEvaluateSplits(bool force_read_by_column) { std::iota(row_indices.begin(), row_indices.end(), 0); row_set_collection.Init(); - hist.Init(gmat.cut.Ptrs().back()); - hist.AddHistRow(0); - hist.AllocateAllData(); + HistMakerTrainParam hist_param; + hist.Reset(gmat.cut.Ptrs().back(), hist_param.internal_max_cached_hist_node); + hist.AllocateHistograms({0}); common::BuildHist(row_gpairs, row_set_collection[0], gmat, hist[0], force_read_by_column); // Compute total gradient for all data points @@ -111,13 +113,13 @@ TEST(HistMultiEvaluator, Evaluate) { RandomDataGenerator{n_samples, n_features, 0.5}.Targets(n_targets).GenerateDMatrix(true); HistMultiEvaluator evaluator{&ctx, p_fmat->Info(), ¶m, sampler}; - std::vector histogram(n_targets); + HistMakerTrainParam hist_param; + std::vector histogram(n_targets); linalg::Vector root_sum({2}, Context::kCpuId); for (bst_target_t t{0}; t < n_targets; ++t) { auto &hist = histogram[t]; - hist.Init(n_bins * n_features); - hist.AddHistRow(0); - hist.AllocateAllData(); + hist.Reset(n_bins * n_features, hist_param.internal_max_cached_hist_node); + hist.AllocateHistograms({0}); auto node_hist = hist[0]; node_hist[0] = {-0.5, 0.5}; node_hist[1] = {2.0, 0.5}; @@ -143,7 +145,7 @@ TEST(HistMultiEvaluator, Evaluate) { std::vector entries(1, {/*nidx=*/0, /*depth=*/0}); - std::vector ptrs; + std::vector ptrs; std::transform(histogram.cbegin(), histogram.cend(), std::back_inserter(ptrs), [](auto const &h) { return std::addressof(h); }); @@ -225,16 +227,16 @@ auto CompareOneHotAndPartition(bool onehot) { auto sampler = std::make_shared(); auto evaluator = HistEvaluator{&ctx, ¶m, dmat->Info(), sampler}; std::vector entries(1); + HistMakerTrainParam hist_param; for (auto const &gmat : dmat->GetBatches(&ctx, {32, param.sparse_threshold})) { - common::HistCollection hist; + BoundedHistCollection hist; entries.front().nid = 0; entries.front().depth = 0; - hist.Init(gmat.cut.TotalBins()); - hist.AddHistRow(0); - hist.AllocateAllData(); + hist.Reset(gmat.cut.TotalBins(), hist_param.internal_max_cached_hist_node); + hist.AllocateHistograms({0}); auto node_hist = hist[0]; CHECK_EQ(node_hist.size(), n_cats); @@ -261,10 +263,10 @@ TEST(HistEvaluator, Categorical) { } TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) { - common::HistCollection hist; - hist.Init(cuts_.TotalBins()); - hist.AddHistRow(0); - hist.AllocateAllData(); + BoundedHistCollection hist; + HistMakerTrainParam hist_param; + hist.Reset(cuts_.TotalBins(), hist_param.internal_max_cached_hist_node); + hist.AllocateHistograms({0}); auto node_hist = hist[0]; ASSERT_EQ(node_hist.size(), feature_histogram_.size()); std::copy(feature_histogram_.cbegin(), feature_histogram_.cend(), node_hist.begin()); diff --git a/tests/cpp/tree/hist/test_histogram.cc b/tests/cpp/tree/hist/test_histogram.cc index b43f7e360..b90b43101 100644 --- a/tests/cpp/tree/hist/test_histogram.cc +++ b/tests/cpp/tree/hist/test_histogram.cc @@ -2,16 +2,38 @@ * Copyright 2018-2023 by Contributors */ #include -#include // Context +#include // for bst_node_t, bst_bin_t, Gradient... 
+#include // for Context +#include // for BatchIterator, BatchSet, DMatrix +#include // for HostDeviceVector +#include // for MakeTensorView +#include // for Error, LogCheck_EQ, LogCheck_LT +#include // for Span, operator!= +#include // for RegTree -#include +#include // for max +#include // for size_t +#include // for int32_t, uint32_t +#include // for function +#include // for back_inserter +#include // for numeric_limits +#include // for shared_ptr, allocator, unique_ptr +#include // for iota, accumulate +#include // for vector -#include "../../../../src/common/categorical.h" -#include "../../../../src/common/row_set.h" -#include "../../../../src/tree/hist/expand_entry.h" -#include "../../../../src/tree/hist/histogram.h" -#include "../../categorical_helpers.h" -#include "../../helpers.h" +#include "../../../../src/collective/communicator-inl.h" // for GetRank, GetWorldSize +#include "../../../../src/common/hist_util.h" // for GHistRow, HistogramCuts, Sketch... +#include "../../../../src/common/ref_resource_view.h" // for RefResourceView +#include "../../../../src/common/row_set.h" // for RowSetCollection +#include "../../../../src/common/threading_utils.h" // for BlockedSpace2d +#include "../../../../src/data/gradient_index.h" // for GHistIndexMatrix +#include "../../../../src/tree/common_row_partitioner.h" // for CommonRowPartitioner +#include "../../../../src/tree/hist/expand_entry.h" // for CPUExpandEntry +#include "../../../../src/tree/hist/hist_cache.h" // for BoundedHistCollection +#include "../../../../src/tree/hist/histogram.h" // for HistogramBuilder +#include "../../../../src/tree/hist/param.h" // for HistMakerTrainParam +#include "../../categorical_helpers.h" // for OneHotEncodeFeature +#include "../../helpers.h" // for RandomDataGenerator, GenerateRa... 
namespace xgboost::tree { namespace { @@ -25,9 +47,8 @@ void InitRowPartitionForTest(common::RowSetCollection *row_set, size_t n_samples void TestAddHistRows(bool is_distributed) { Context ctx; - std::vector nodes_for_explicit_hist_build_; - std::vector nodes_for_subtraction_trick_; - int starting_index = std::numeric_limits::max(); + std::vector nodes_to_build; + std::vector nodes_to_sub; size_t constexpr kNRows = 8, kNCols = 16; int32_t constexpr kMaxBins = 4; @@ -40,24 +61,22 @@ void TestAddHistRows(bool is_distributed) { tree.ExpandNode(0, 0, 0, false, 0, 0, 0, 0, 0, 0, 0); tree.ExpandNode(tree[0].LeftChild(), 0, 0, false, 0, 0, 0, 0, 0, 0, 0); tree.ExpandNode(tree[0].RightChild(), 0, 0, false, 0, 0, 0, 0, 0, 0, 0); - nodes_for_explicit_hist_build_.emplace_back(3, tree.GetDepth(3)); - nodes_for_explicit_hist_build_.emplace_back(4, tree.GetDepth(4)); - nodes_for_subtraction_trick_.emplace_back(5, tree.GetDepth(5)); - nodes_for_subtraction_trick_.emplace_back(6, tree.GetDepth(6)); + nodes_to_build.emplace_back(3); + nodes_to_build.emplace_back(4); + nodes_to_sub.emplace_back(5); + nodes_to_sub.emplace_back(6); - HistogramBuilder histogram_builder; - histogram_builder.Reset(gmat.cut.TotalBins(), {kMaxBins, 0.5}, omp_get_max_threads(), 1, - is_distributed, false); - histogram_builder.AddHistRows(&starting_index, nodes_for_explicit_hist_build_, - nodes_for_subtraction_trick_); + HistMakerTrainParam hist_param; + HistogramBuilder histogram_builder; + histogram_builder.Reset(&ctx, gmat.cut.TotalBins(), {kMaxBins, 0.5}, is_distributed, false, + &hist_param); + histogram_builder.AddHistRows(&tree, &nodes_to_build, &nodes_to_sub, false); - ASSERT_EQ(starting_index, 3); - - for (const CPUExpandEntry &node : nodes_for_explicit_hist_build_) { - ASSERT_EQ(histogram_builder.Histogram().RowExists(node.nid), true); + for (bst_node_t const &nidx : nodes_to_build) { + ASSERT_TRUE(histogram_builder.Histogram().HistogramExists(nidx)); } - for (const CPUExpandEntry &node : nodes_for_subtraction_trick_) { - ASSERT_EQ(histogram_builder.Histogram().RowExists(node.nid), true); + for (bst_node_t const &nidx : nodes_to_sub) { + ASSERT_TRUE(histogram_builder.Histogram().HistogramExists(nidx)); } } @@ -68,83 +87,77 @@ TEST(CPUHistogram, AddRows) { } void TestSyncHist(bool is_distributed) { - size_t constexpr kNRows = 8, kNCols = 16; - int32_t constexpr kMaxBins = 4; + std::size_t constexpr kNRows = 8, kNCols = 16; + bst_bin_t constexpr kMaxBins = 4; Context ctx; - std::vector nodes_for_explicit_hist_build_; - std::vector nodes_for_subtraction_trick_; - int starting_index = std::numeric_limits::max(); + std::vector nodes_for_explicit_hist_build; + std::vector nodes_for_subtraction_trick; RegTree tree; auto p_fmat = RandomDataGenerator(kNRows, kNCols, 0.8).Seed(3).GenerateDMatrix(); auto const &gmat = *(p_fmat->GetBatches(&ctx, BatchParam{kMaxBins, 0.5}).begin()); - HistogramBuilder histogram; + HistogramBuilder histogram; uint32_t total_bins = gmat.cut.Ptrs().back(); - histogram.Reset(total_bins, {kMaxBins, 0.5}, omp_get_max_threads(), 1, is_distributed, false); + HistMakerTrainParam hist_param; + histogram.Reset(&ctx, total_bins, {kMaxBins, 0.5}, is_distributed, false, &hist_param); - common::RowSetCollection row_set_collection_; + common::RowSetCollection row_set_collection; { - row_set_collection_.Clear(); - std::vector &row_indices = *row_set_collection_.Data(); + row_set_collection.Clear(); + std::vector &row_indices = *row_set_collection.Data(); row_indices.resize(kNRows); std::iota(row_indices.begin(), 
row_indices.end(), 0); - row_set_collection_.Init(); + row_set_collection.Init(); } // level 0 - nodes_for_explicit_hist_build_.emplace_back(0, tree.GetDepth(0)); - histogram.AddHistRows(&starting_index, nodes_for_explicit_hist_build_, - nodes_for_subtraction_trick_); + nodes_for_explicit_hist_build.emplace_back(0); + histogram.AddHistRows(&tree, &nodes_for_explicit_hist_build, &nodes_for_subtraction_trick, false); tree.ExpandNode(0, 0, 0, false, 0, 0, 0, 0, 0, 0, 0); - nodes_for_explicit_hist_build_.clear(); - nodes_for_subtraction_trick_.clear(); + nodes_for_explicit_hist_build.clear(); + nodes_for_subtraction_trick.clear(); // level 1 - nodes_for_explicit_hist_build_.emplace_back(tree[0].LeftChild(), tree.GetDepth(1)); - nodes_for_subtraction_trick_.emplace_back(tree[0].RightChild(), tree.GetDepth(2)); + nodes_for_explicit_hist_build.emplace_back(tree[0].LeftChild()); + nodes_for_subtraction_trick.emplace_back(tree[0].RightChild()); - histogram.AddHistRows(&starting_index, nodes_for_explicit_hist_build_, - nodes_for_subtraction_trick_); + histogram.AddHistRows(&tree, &nodes_for_explicit_hist_build, &nodes_for_subtraction_trick, false); tree.ExpandNode(tree[0].LeftChild(), 0, 0, false, 0, 0, 0, 0, 0, 0, 0); tree.ExpandNode(tree[0].RightChild(), 0, 0, false, 0, 0, 0, 0, 0, 0, 0); - nodes_for_explicit_hist_build_.clear(); - nodes_for_subtraction_trick_.clear(); + nodes_for_explicit_hist_build.clear(); + nodes_for_subtraction_trick.clear(); // level 2 - nodes_for_explicit_hist_build_.emplace_back(3, tree.GetDepth(3)); - nodes_for_subtraction_trick_.emplace_back(4, tree.GetDepth(4)); - nodes_for_explicit_hist_build_.emplace_back(5, tree.GetDepth(5)); - nodes_for_subtraction_trick_.emplace_back(6, tree.GetDepth(6)); + nodes_for_explicit_hist_build.emplace_back(3); + nodes_for_subtraction_trick.emplace_back(4); + nodes_for_explicit_hist_build.emplace_back(5); + nodes_for_subtraction_trick.emplace_back(6); - histogram.AddHistRows(&starting_index, nodes_for_explicit_hist_build_, - nodes_for_subtraction_trick_); + histogram.AddHistRows(&tree, &nodes_for_explicit_hist_build, &nodes_for_subtraction_trick, false); - const size_t n_nodes = nodes_for_explicit_hist_build_.size(); + const size_t n_nodes = nodes_for_explicit_hist_build.size(); ASSERT_EQ(n_nodes, 2ul); - row_set_collection_.AddSplit(0, tree[0].LeftChild(), tree[0].RightChild(), 4, - 4); - row_set_collection_.AddSplit(1, tree[1].LeftChild(), tree[1].RightChild(), 2, - 2); - row_set_collection_.AddSplit(2, tree[2].LeftChild(), tree[2].RightChild(), 2, - 2); + row_set_collection.AddSplit(0, tree[0].LeftChild(), tree[0].RightChild(), 4, 4); + row_set_collection.AddSplit(1, tree[1].LeftChild(), tree[1].RightChild(), 2, 2); + row_set_collection.AddSplit(2, tree[2].LeftChild(), tree[2].RightChild(), 2, 2); common::BlockedSpace2d space( n_nodes, - [&](size_t node) { - const int32_t nid = nodes_for_explicit_hist_build_[node].nid; - return row_set_collection_[nid].Size(); + [&](std::size_t nidx_in_set) { + bst_node_t nidx = nodes_for_explicit_hist_build[nidx_in_set]; + return row_set_collection[nidx].Size(); }, 256); std::vector target_hists(n_nodes); - for (size_t i = 0; i < nodes_for_explicit_hist_build_.size(); ++i) { - const int32_t nid = nodes_for_explicit_hist_build_[i].nid; - target_hists[i] = histogram.Histogram()[nid]; + for (size_t i = 0; i < nodes_for_explicit_hist_build.size(); ++i) { + bst_node_t nidx = nodes_for_explicit_hist_build[i]; + target_hists[i] = histogram.Histogram()[nidx]; } // set values to specific nodes hist @@ -168,8 
+181,7 @@ void TestSyncHist(bool is_distributed) { histogram.Buffer().Reset(1, n_nodes, space, target_hists); // sync hist - histogram.SyncHistogram(&tree, nodes_for_explicit_hist_build_, - nodes_for_subtraction_trick_, starting_index); + histogram.SyncHistogram(&tree, nodes_for_explicit_hist_build, nodes_for_subtraction_trick); using GHistRowT = common::GHistRow; auto check_hist = [](const GHistRowT parent, const GHistRowT left, const GHistRowT right, @@ -182,11 +194,10 @@ void TestSyncHist(bool is_distributed) { } }; size_t node_id = 0; - for (const CPUExpandEntry &node : nodes_for_explicit_hist_build_) { - auto this_hist = histogram.Histogram()[node.nid]; - const size_t parent_id = tree[node.nid].Parent(); - const size_t subtraction_node_id = - nodes_for_subtraction_trick_[node_id].nid; + for (auto const &nidx : nodes_for_explicit_hist_build) { + auto this_hist = histogram.Histogram()[nidx]; + const size_t parent_id = tree[nidx].Parent(); + const size_t subtraction_node_id = nodes_for_subtraction_trick[node_id]; auto parent_hist = histogram.Histogram()[parent_id]; auto sibling_hist = histogram.Histogram()[subtraction_node_id]; @@ -194,11 +205,10 @@ void TestSyncHist(bool is_distributed) { ++node_id; } node_id = 0; - for (const CPUExpandEntry &node : nodes_for_subtraction_trick_) { - auto this_hist = histogram.Histogram()[node.nid]; - const size_t parent_id = tree[node.nid].Parent(); - const size_t subtraction_node_id = - nodes_for_explicit_hist_build_[node_id].nid; + for (auto const &nidx : nodes_for_subtraction_trick) { + auto this_hist = histogram.Histogram()[nidx]; + const size_t parent_id = tree[nidx].Parent(); + const size_t subtraction_node_id = nodes_for_explicit_hist_build[node_id]; auto parent_hist = histogram.Histogram()[parent_id]; auto sibling_hist = histogram.Histogram()[subtraction_node_id]; @@ -232,9 +242,9 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_ {0.27f, 0.29f}, {0.37f, 0.39f}, {0.47f, 0.49f}, {0.57f, 0.59f}}; bst_node_t nid = 0; - HistogramBuilder histogram; - histogram.Reset(total_bins, {kMaxBins, 0.5}, omp_get_max_threads(), 1, is_distributed, - is_col_split); + HistogramBuilder histogram; + HistMakerTrainParam hist_param; + histogram.Reset(&ctx, total_bins, {kMaxBins, 0.5}, is_distributed, is_col_split, &hist_param); RegTree tree; @@ -246,12 +256,17 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_ row_set_collection.Init(); CPUExpandEntry node{RegTree::kRoot, tree.GetDepth(0)}; - std::vector nodes_for_explicit_hist_build; - nodes_for_explicit_hist_build.push_back(node); + std::vector nodes_to_build{node.nid}; + std::vector dummy_sub; + + histogram.AddHistRows(&tree, &nodes_to_build, &dummy_sub, false); + common::BlockedSpace2d space{ + 1, [&](std::size_t nidx_in_set) { return row_set_collection[nidx_in_set].Size(); }, 256}; for (auto const &gidx : p_fmat->GetBatches(&ctx, {kMaxBins, 0.5})) { - histogram.BuildHist(0, gidx, &tree, row_set_collection, nodes_for_explicit_hist_build, {}, - gpair, force_read_by_column); + histogram.BuildHist(0, space, gidx, row_set_collection, nodes_to_build, + linalg::MakeTensorView(&ctx, gpair, gpair.size()), force_read_by_column); } + histogram.SyncHistogram(&tree, nodes_to_build, {}); // Check if number of histogram bins is correct ASSERT_EQ(histogram.Histogram()[nid].size(), gmat.cut.Ptrs().back()); @@ -312,18 +327,18 @@ void ValidateCategoricalHistogram(size_t n_categories, void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) { 
size_t constexpr kRows = 340; - int32_t constexpr kBins = 256; + bst_bin_t constexpr kBins = 256; auto x = GenerateRandomCategoricalSingleColumn(kRows, n_categories); auto cat_m = GetDMatrixFromData(x, kRows, 1); cat_m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical); Context ctx; - BatchParam batch_param{0, static_cast(kBins)}; + BatchParam batch_param{0, kBins}; RegTree tree; - CPUExpandEntry node{RegTree::kRoot, tree.GetDepth(0)}; - std::vector nodes_for_explicit_hist_build; - nodes_for_explicit_hist_build.push_back(node); + CPUExpandEntry node{RegTree::kRoot, tree.GetDepth(RegTree::kRoot)}; + std::vector nodes_to_build; + nodes_to_build.push_back(node.nid); auto gpair = GenerateRandomGradients(kRows, 0, 2); @@ -333,30 +348,41 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) { row_indices.resize(kRows); std::iota(row_indices.begin(), row_indices.end(), 0); row_set_collection.Init(); + HistMakerTrainParam hist_param; + std::vector dummy_sub; + + common::BlockedSpace2d space{ + 1, [&](std::size_t nidx_in_set) { return row_set_collection[nidx_in_set].Size(); }, 256}; /** * Generate hist with cat data. */ - HistogramBuilder cat_hist; + HistogramBuilder cat_hist; for (auto const &gidx : cat_m->GetBatches(&ctx, {kBins, 0.5})) { auto total_bins = gidx.cut.TotalBins(); - cat_hist.Reset(total_bins, {kBins, 0.5}, omp_get_max_threads(), 1, false, false); - cat_hist.BuildHist(0, gidx, &tree, row_set_collection, nodes_for_explicit_hist_build, {}, - gpair.HostVector(), force_read_by_column); + cat_hist.Reset(&ctx, total_bins, {kBins, 0.5}, false, false, &hist_param); + cat_hist.AddHistRows(&tree, &nodes_to_build, &dummy_sub, false); + cat_hist.BuildHist(0, space, gidx, row_set_collection, nodes_to_build, + linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size()), + force_read_by_column); } + cat_hist.SyncHistogram(&tree, nodes_to_build, {}); /** * Generate hist with one hot encoded data. 
*/ auto x_encoded = OneHotEncodeFeature(x, n_categories); auto encode_m = GetDMatrixFromData(x_encoded, kRows, n_categories); - HistogramBuilder onehot_hist; + HistogramBuilder onehot_hist; for (auto const &gidx : encode_m->GetBatches(&ctx, {kBins, 0.5})) { auto total_bins = gidx.cut.TotalBins(); - onehot_hist.Reset(total_bins, {kBins, 0.5}, omp_get_max_threads(), 1, false, false); - onehot_hist.BuildHist(0, gidx, &tree, row_set_collection, nodes_for_explicit_hist_build, {}, - gpair.HostVector(), force_read_by_column); + onehot_hist.Reset(&ctx, total_bins, {kBins, 0.5}, false, false, &hist_param); + onehot_hist.AddHistRows(&tree, &nodes_to_build, &dummy_sub, false); + onehot_hist.BuildHist(0, space, gidx, row_set_collection, nodes_to_build, + linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size()), + force_read_by_column); } + onehot_hist.SyncHistogram(&tree, nodes_to_build, {}); auto cat = cat_hist.Histogram()[0]; auto onehot = onehot_hist.Histogram()[0]; @@ -383,19 +409,22 @@ void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, boo batch_param.hess = hess; } - std::vector partition_size(1, 0); - size_t total_bins{0}; - size_t n_samples{0}; + std::vector partition_size(1, 0); + bst_bin_t total_bins{0}; + bst_row_t n_samples{0}; auto gpair = GenerateRandomGradients(m->Info().num_row_, 0.0, 1.0); auto const &h_gpair = gpair.HostVector(); RegTree tree; - std::vector nodes; - nodes.emplace_back(0, tree.GetDepth(0)); + std::vector nodes{RegTree::kRoot}; + common::BlockedSpace2d space{ + 1, [&](std::size_t nidx_in_set) { return partition_size.at(nidx_in_set); }, 256}; common::GHistRow multi_page; - HistogramBuilder multi_build; + HistogramBuilder multi_build; + HistMakerTrainParam hist_param; + std::vector dummy_sub; { /** * Multi page @@ -413,23 +442,21 @@ void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, boo } ASSERT_EQ(n_samples, m->Info().num_row_); - common::BlockedSpace2d space{ - 1, [&](size_t nidx_in_set) { return partition_size.at(nidx_in_set); }, - 256}; - - multi_build.Reset(total_bins, batch_param, ctx->Threads(), rows_set.size(), false, false); - - size_t page_idx{0}; + multi_build.Reset(ctx, total_bins, batch_param, false, false, &hist_param); + multi_build.AddHistRows(&tree, &nodes, &dummy_sub, false); + std::size_t page_idx{0}; for (auto const &page : m->GetBatches(ctx, batch_param)) { - multi_build.BuildHist(page_idx, space, page, &tree, rows_set.at(page_idx), nodes, {}, h_gpair, + multi_build.BuildHist(page_idx, space, page, rows_set[page_idx], nodes, + linalg::MakeTensorView(ctx, h_gpair, h_gpair.size()), force_read_by_column); ++page_idx; } - ASSERT_EQ(page_idx, 2); - multi_page = multi_build.Histogram()[0]; + multi_build.SyncHistogram(&tree, nodes, {}); + + multi_page = multi_build.Histogram()[RegTree::kRoot]; } - HistogramBuilder single_build; + HistogramBuilder single_build; common::GHistRow single_page; { /** @@ -438,18 +465,24 @@ void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, boo common::RowSetCollection row_set_collection; InitRowPartitionForTest(&row_set_collection, n_samples); - single_build.Reset(total_bins, batch_param, ctx->Threads(), 1, false, false); + single_build.Reset(ctx, total_bins, batch_param, false, false, &hist_param); SparsePage concat; std::vector hess(m->Info().num_row_, 1.0f); - for (auto const& page : m->GetBatches()) { + for (auto const &page : m->GetBatches()) { concat.Push(page); } auto cut = common::SketchOnDMatrix(ctx, m.get(), batch_param.max_bin, 
false, hess); GHistIndexMatrix gmat(concat, {}, cut, batch_param.max_bin, false, std::numeric_limits::quiet_NaN(), ctx->Threads()); - single_build.BuildHist(0, gmat, &tree, row_set_collection, nodes, {}, h_gpair, force_read_by_column); - single_page = single_build.Histogram()[0]; + + single_build.AddHistRows(&tree, &nodes, &dummy_sub, false); + single_build.BuildHist(0, space, gmat, row_set_collection, nodes, + linalg::MakeTensorView(ctx, h_gpair, h_gpair.size()), + force_read_by_column); + single_build.SyncHistogram(&tree, nodes, {}); + + single_page = single_build.Histogram()[RegTree::kRoot]; } for (size_t i = 0; i < single_page.size(); ++i) { @@ -473,4 +506,108 @@ TEST(CPUHistogram, ExternalMemory) { TestHistogramExternalMemory(&ctx, {kBins, sparse_thresh}, false, false); TestHistogramExternalMemory(&ctx, {kBins, sparse_thresh}, false, true); } + +namespace { +class OverflowTest : public ::testing::TestWithParam> { + public: + std::vector TestOverflow(bool limit, bool is_distributed, + bool is_col_split) { + bst_bin_t constexpr kBins = 256; + Context ctx; + HistMakerTrainParam hist_param; + if (limit) { + hist_param.Init(Args{{"internal_max_cached_hist_node", "1"}}); + } + + std::shared_ptr Xy = + is_col_split ? RandomDataGenerator{8192, 16, 0.5}.GenerateDMatrix(true) + : RandomDataGenerator{8192, 16, 0.5}.Bins(kBins).GenerateQuantileDMatrix(true); + if (is_col_split) { + Xy = + std::shared_ptr{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())}; + } + + double sparse_thresh{TrainParam::DftSparseThreshold()}; + auto batch = BatchParam{kBins, sparse_thresh}; + bst_bin_t n_total_bins{0}; + float split_cond{0}; + for (auto const &page : Xy->GetBatches(&ctx, batch)) { + n_total_bins = page.cut.TotalBins(); + // use a cut point in the second column for split + split_cond = page.cut.Values()[kBins + kBins / 2]; + } + + RegTree tree; + MultiHistogramBuilder hist_builder; + CHECK_EQ(Xy->Info().IsColumnSplit(), is_col_split); + + hist_builder.Reset(&ctx, n_total_bins, tree.NumTargets(), batch, is_distributed, + Xy->Info().IsColumnSplit(), &hist_param); + + std::vector partitioners; + partitioners.emplace_back(&ctx, Xy->Info().num_row_, /*base_rowid=*/0, + Xy->Info().IsColumnSplit()); + + auto gpair = GenerateRandomGradients(Xy->Info().num_row_, 0.0, 1.0); + + CPUExpandEntry best; + hist_builder.BuildRootHist(Xy.get(), &tree, partitioners, + linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size(), 1), + best, batch); + + best.split.Update(1.0f, 1, split_cond, false, false, GradStats{1.0, 1.0}, GradStats{1.0, 1.0}); + tree.ExpandNode(best.nid, best.split.SplitIndex(), best.split.split_value, false, + /*base_weight=*/2.0f, + /*left_leaf_weight=*/1.0f, /*right_leaf_weight=*/1.0f, best.GetLossChange(), + /*sum_hess=*/2.0f, best.split.left_sum.GetHess(), + best.split.right_sum.GetHess()); + + std::vector valid_candidates{best}; + for (auto const &page : Xy->GetBatches(&ctx, batch)) { + partitioners.front().UpdatePosition(&ctx, page, valid_candidates, &tree); + } + CHECK_NE(partitioners.front()[tree.LeftChild(best.nid)].Size(), 0); + CHECK_NE(partitioners.front()[tree.RightChild(best.nid)].Size(), 0); + + hist_builder.BuildHistLeftRight( + Xy.get(), &tree, partitioners, valid_candidates, + linalg::MakeTensorView(&ctx, gpair.ConstHostSpan(), gpair.Size(), 1), batch); + + if (limit) { + CHECK(!hist_builder.Histogram(0).HistogramExists(best.nid)); + } else { + CHECK(hist_builder.Histogram(0).HistogramExists(best.nid)); + } + + std::vector result; + auto hist = 
hist_builder.Histogram(0)[tree.LeftChild(best.nid)]; + std::copy(hist.cbegin(), hist.cend(), std::back_inserter(result)); + hist = hist_builder.Histogram(0)[tree.RightChild(best.nid)]; + std::copy(hist.cbegin(), hist.cend(), std::back_inserter(result)); + + return result; + } + + void RunTest() { + auto param = GetParam(); + auto res0 = this->TestOverflow(false, std::get<0>(param), std::get<1>(param)); + auto res1 = this->TestOverflow(true, std::get<0>(param), std::get<1>(param)); + ASSERT_EQ(res0, res1); + } +}; + +auto MakeParamsForTest() { + std::vector> configs; + for (auto i : {true, false}) { + for (auto j : {true, false}) { + configs.emplace_back(i, j); + } + } + return configs; +} +} // anonymous namespace + +TEST_P(OverflowTest, Overflow) { this->RunTest(); } + +INSTANTIATE_TEST_SUITE_P(CPUHistogram, OverflowTest, ::testing::ValuesIn(MakeParamsForTest())); } // namespace xgboost::tree diff --git a/tests/cpp/tree/test_evaluate_splits.h b/tests/cpp/tree/test_evaluate_splits.h index a7e8972e5..04da4777d 100644 --- a/tests/cpp/tree/test_evaluate_splits.h +++ b/tests/cpp/tree/test_evaluate_splits.h @@ -2,22 +2,24 @@ * Copyright 2022-2023 by XGBoost Contributors */ #include -#include // for GradientPairInternal, GradientPairPrecise -#include // for MetaInfo -#include // for HostDeviceVector -#include // for operator!=, Span, SpanIterator +#include // for GradientPairInternal, GradientPairPrecise +#include // for MetaInfo +#include // for HostDeviceVector +#include // for operator!=, Span, SpanIterator -#include // for max, max_element, next_permutation, copy -#include // for isnan -#include // for size_t -#include // for int32_t, uint64_t, uint32_t -#include // for numeric_limits -#include // for iota -#include // for make_tuple, tie, tuple -#include // for pair -#include // for vector +#include // for max, max_element, next_permutation, copy +#include // for isnan +#include // for size_t +#include // for int32_t, uint64_t, uint32_t +#include // for numeric_limits +#include // for iota +#include // for make_tuple, tie, tuple +#include // for pair +#include // for vector #include "../../../src/common/hist_util.h" // for HistogramCuts, HistCollection, GHistRow +#include "../../../src/tree/hist/hist_cache.h" // for HistogramCollection +#include "../../../src/tree/hist/param.h" // for HistMakerTrainParam #include "../../../src/tree/param.h" // for TrainParam, GradStats #include "../../../src/tree/split_evaluator.h" // for TreeEvaluator #include "../helpers.h" // for SimpleLCG, SimpleRealUniformDistribution @@ -35,7 +37,7 @@ class TestPartitionBasedSplit : public ::testing::Test { MetaInfo info_; float best_score_{-std::numeric_limits::infinity()}; common::HistogramCuts cuts_; - common::HistCollection hist_; + BoundedHistCollection hist_; GradientPairPrecise total_gpair_; void SetUp() override { @@ -56,9 +58,9 @@ class TestPartitionBasedSplit : public ::testing::Test { cuts_.min_vals_.Resize(1); - hist_.Init(cuts_.TotalBins()); - hist_.AddHistRow(0); - hist_.AllocateAllData(); + HistMakerTrainParam hist_param; + hist_.Reset(cuts_.TotalBins(), hist_param.internal_max_cached_hist_node); + hist_.AllocateHistograms({0}); auto node_hist = hist_[0]; SimpleLCG lcg; diff --git a/tests/python-gpu/test_device_quantile_dmatrix.py b/tests/python-gpu/test_device_quantile_dmatrix.py index ace17933b..4cfc61321 100644 --- a/tests/python-gpu/test_device_quantile_dmatrix.py +++ b/tests/python-gpu/test_device_quantile_dmatrix.py @@ -7,6 +7,7 @@ from hypothesis import given, settings, strategies import xgboost 
as xgb from xgboost import testing as tm from xgboost.testing.data import check_inf +from xgboost.testing.data_iter import run_mixed_sparsity sys.path.append("tests/python") import test_quantile_dmatrix as tqd @@ -232,3 +233,6 @@ class TestQuantileDMatrix: rng = cp.random.default_rng(1994) check_inf(rng) + + def test_mixed_sparsity(self) -> None: + run_mixed_sparsity("cuda") diff --git a/tests/python/test_quantile_dmatrix.py b/tests/python/test_quantile_dmatrix.py index b7428dfac..8ee00b8c0 100644 --- a/tests/python/test_quantile_dmatrix.py +++ b/tests/python/test_quantile_dmatrix.py @@ -16,6 +16,7 @@ from xgboost.testing import ( predictor_equal, ) from xgboost.testing.data import check_inf, np_dtypes +from xgboost.testing.data_iter import run_mixed_sparsity class TestQuantileDMatrix: @@ -334,3 +335,6 @@ class TestQuantileDMatrix: with pytest.raises(ValueError, match="consistent"): xgb.train({}, Xy, num_boost_round=2, xgb_model=booster) + + def test_mixed_sparsity(self) -> None: + run_mixed_sparsity("cpu") diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index 5374a2891..3fa32660d 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -11,6 +11,7 @@ from xgboost import testing as tm from xgboost.testing.params import ( cat_parameter_strategy, exact_parameter_strategy, + hist_cache_strategy, hist_multi_parameter_strategy, hist_parameter_strategy, ) @@ -40,14 +41,22 @@ class TestTreeMethodMulti: @given( exact_parameter_strategy, hist_parameter_strategy, + hist_cache_strategy, strategies.integers(1, 20), tm.multi_dataset_strategy, ) @settings(deadline=None, print_blob=True) - def test_approx(self, param, hist_param, num_rounds, dataset): + def test_approx( + self, param: Dict[str, Any], + hist_param: Dict[str, Any], + cache_param: Dict[str, Any], + num_rounds: int, + dataset: tm.TestDataset, + ) -> None: param["tree_method"] = "approx" param = dataset.set_params(param) param.update(hist_param) + param.update(cache_param) result = train_result(param, dataset.get_dmat(), num_rounds) note(result) assert tm.non_increasing(result["train"][dataset.metric]) @@ -55,18 +64,25 @@ class TestTreeMethodMulti: @given( exact_parameter_strategy, hist_multi_parameter_strategy, + hist_cache_strategy, strategies.integers(1, 20), tm.multi_dataset_strategy, ) @settings(deadline=None, print_blob=True) def test_hist( - self, param: dict, hist_param: dict, num_rounds: int, dataset: tm.TestDataset + self, + param: Dict[str, Any], + hist_param: Dict[str, Any], + cache_param: Dict[str, Any], + num_rounds: int, + dataset: tm.TestDataset, ) -> None: if dataset.name.endswith("-l1"): return param["tree_method"] = "hist" param = dataset.set_params(param) param.update(hist_param) + param.update(cache_param) result = train_result(param, dataset.get_dmat(), num_rounds) note(result) assert tm.non_increasing(result["train"][dataset.metric]) @@ -91,14 +107,23 @@ class TestTreeMethod: @given( exact_parameter_strategy, hist_parameter_strategy, + hist_cache_strategy, strategies.integers(1, 20), tm.make_dataset_strategy(), ) @settings(deadline=None, print_blob=True) - def test_approx(self, param, hist_param, num_rounds, dataset): + def test_approx( + self, + param: Dict[str, Any], + hist_param: Dict[str, Any], + cache_param: Dict[str, Any], + num_rounds: int, + dataset: tm.TestDataset, + ) -> None: param["tree_method"] = "approx" param = dataset.set_params(param) param.update(hist_param) + param.update(cache_param) result = train_result(param, dataset.get_dmat(), num_rounds) 
note(result) assert tm.non_increasing(result["train"][dataset.metric]) @@ -130,17 +155,25 @@ class TestTreeMethod: @given( exact_parameter_strategy, hist_parameter_strategy, + hist_cache_strategy, strategies.integers(1, 20), tm.make_dataset_strategy() ) @settings(deadline=None, print_blob=True) - def test_hist(self, param: dict, hist_param: dict, num_rounds: int, dataset: tm.TestDataset) -> None: - param['tree_method'] = 'hist' + def test_hist( + self, param: Dict[str, Any], + hist_param: Dict[str, Any], + cache_param: Dict[str, Any], + num_rounds: int, + dataset: tm.TestDataset, + ) -> None: + param["tree_method"] = "hist" param = dataset.set_params(param) param.update(hist_param) + param.update(cache_param) result = train_result(param, dataset.get_dmat(), num_rounds) note(result) - assert tm.non_increasing(result['train'][dataset.metric]) + assert tm.non_increasing(result["train"][dataset.metric]) def test_hist_categorical(self): # hist must be same as exact on all-categorial data diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py index 3add01192..5630e5f3e 100644 --- a/tests/test_distributed/test_with_dask/test_with_dask.py +++ b/tests/test_distributed/test_with_dask/test_with_dask.py @@ -24,7 +24,7 @@ from sklearn.datasets import make_classification, make_regression import xgboost as xgb from xgboost import testing as tm from xgboost.data import _is_cudf_df -from xgboost.testing.params import hist_parameter_strategy +from xgboost.testing.params import hist_cache_strategy, hist_parameter_strategy from xgboost.testing.shared import ( get_feature_weights, validate_data_initialization, @@ -1512,14 +1512,23 @@ class TestWithDask: else: assert history[-1] < history[0] - @given(params=hist_parameter_strategy, dataset=tm.make_dataset_strategy()) + @given( + params=hist_parameter_strategy, + cache_param=hist_cache_strategy, + dataset=tm.make_dataset_strategy(), + ) @settings( deadline=None, max_examples=10, suppress_health_check=suppress, print_blob=True ) def test_hist( - self, params: Dict, dataset: tm.TestDataset, client: "Client" + self, + params: Dict[str, Any], + cache_param: Dict[str, Any], + dataset: tm.TestDataset, + client: "Client", ) -> None: num_rounds = 10 + params.update(cache_param) self.run_updater_test(client, params, num_rounds, dataset, "hist") def test_quantile_dmatrix(self, client: Client) -> None: @@ -1579,14 +1588,23 @@ class TestWithDask: rmse = result["history"]["Valid"]["rmse"][-1] assert rmse < 32.0 - @given(params=hist_parameter_strategy, dataset=tm.make_dataset_strategy()) + @given( + params=hist_parameter_strategy, + cache_param=hist_cache_strategy, + dataset=tm.make_dataset_strategy() + ) @settings( deadline=None, max_examples=10, suppress_health_check=suppress, print_blob=True ) def test_approx( - self, client: "Client", params: Dict, dataset: tm.TestDataset + self, + client: "Client", + params: Dict, + cache_param: Dict[str, Any], + dataset: tm.TestDataset, ) -> None: num_rounds = 10 + params.update(cache_param) self.run_updater_test(client, params, num_rounds, dataset, "approx") def test_adaptive(self) -> None: @@ -2239,7 +2257,7 @@ async def test_worker_left(c, s, a, b): ) await async_poll_for(lambda: len(s.workers) == 2, timeout=5) with pytest.raises(RuntimeError, match="Missing"): - await xgb.dask.train( + await xgb.dask.train( c, {}, d_train, @@ -2256,7 +2274,7 @@ async def test_worker_restarted(c, s, a, b): ) await c.restart_workers([a.worker_address]) with 
pytest.raises(RuntimeError, match="Missing"): - await xgb.dask.train( + await xgb.dask.train( c, {}, d_train, From 97fd5207dd1dbff4fce0df2d844ead19d2bf93cd Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 8 Aug 2023 14:04:46 +0800 Subject: [PATCH 069/136] Use lambda function in `ParallelFor2D`. (#9441) --- src/common/threading_utils.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/common/threading_utils.h b/src/common/threading_utils.h index 3c1636906..4ca4ca070 100644 --- a/src/common/threading_utils.h +++ b/src/common/threading_utils.h @@ -13,7 +13,7 @@ #include // for malloc, free #include // for function #include // for bad_alloc -#include // for is_signed, conditional_t +#include // for is_signed, conditional_t, is_integral_v, invoke_result_t #include // for vector #include "xgboost/logging.h" @@ -87,8 +87,9 @@ class BlockedSpace2d { // dim1 - size of the first dimension in the space // getter_size_dim2 - functor to get the second dimensions for each 'row' by row-index // grain_size - max size of produced blocks - BlockedSpace2d(std::size_t dim1, std::function getter_size_dim2, - std::size_t grain_size) { + template + BlockedSpace2d(std::size_t dim1, Getter&& getter_size_dim2, std::size_t grain_size) { + static_assert(std::is_integral_v>); for (std::size_t i = 0; i < dim1; ++i) { std::size_t size = getter_size_dim2(i); // Each row (second dim) is divided into n_blocks @@ -137,8 +138,9 @@ class BlockedSpace2d { // Wrapper to implement nested parallelism with simple omp parallel for -inline void ParallelFor2d(BlockedSpace2d const& space, std::int32_t n_threads, - std::function func) { +template +void ParallelFor2d(const BlockedSpace2d& space, int n_threads, Func&& func) { + static_assert(std::is_void_v>); std::size_t n_blocks_in_space = space.Size(); CHECK_GE(n_threads, 1); From 7ce090e775e017f7850a279befddb1d694c0c9fe Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Mon, 7 Aug 2023 23:27:25 -0700 Subject: [PATCH 070/136] Handle UTF-8 paths correctly on Windows platform (#9443) * Fix round-trip serialization with UTF-8 paths * Add compiler version check * Add comment to C API functions * Add Python tests * [CI] Updatre MacOS deployment target * Use std::filesystem instead of dmlc::TemporaryDirectory --- CMakeLists.txt | 20 ++++++++++++++++++-- include/xgboost/c_api.h | 4 ++-- src/common/io.cc | 3 ++- tests/ci_build/build_python_wheels.sh | 2 +- tests/cpp/c_api/test_c_api.cc | 25 +++++++++++++------------ tests/python/test_basic.py | 12 ++++++++++++ 6 files changed, 48 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6a3fd3c73..a5eebef2e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,8 +14,24 @@ endif ((${CMAKE_VERSION} VERSION_GREATER 3.13) OR (${CMAKE_VERSION} VERSION_EQUA message(STATUS "CMake version ${CMAKE_VERSION}") -if (CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.0) - message(FATAL_ERROR "GCC version must be at least 5.0!") +# Check compiler versions +# Use recent compilers to ensure that std::filesystem is available +if(MSVC) + if(MSVC_VERSION LESS 1920) + message(FATAL_ERROR "Need Visual Studio 2019 or newer to build XGBoost") + endif() +elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "8.1") + message(FATAL_ERROR "Need GCC 8.1 or newer to build XGBoost") + endif() +elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "11.0") + message(FATAL_ERROR "Need Xcode 11.0 (AppleClang 
11.0) or newer to build XGBoost") + endif() +elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "9.0") + message(FATAL_ERROR "Need Clang 9.0 or newer to build XGBoost") + endif() endif() include(${xgboost_SOURCE_DIR}/cmake/FindPrefetchIntrinsics.cmake) diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index 8844b853b..fc60d2e77 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -1221,7 +1221,7 @@ XGB_DLL int XGBoosterPredictFromCudaColumnar(BoosterHandle handle, char const *v * \brief Load model from existing file * * \param handle handle - * \param fname File URI or file name. + * \param fname File URI or file name. The string must be UTF-8 encoded. * \return 0 when success, -1 when failure happens */ XGB_DLL int XGBoosterLoadModel(BoosterHandle handle, @@ -1230,7 +1230,7 @@ XGB_DLL int XGBoosterLoadModel(BoosterHandle handle, * \brief Save model into existing file * * \param handle handle - * \param fname File URI or file name. + * \param fname File URI or file name. The string must be UTF-8 encoded. * \return 0 when success, -1 when failure happens */ XGB_DLL int XGBoosterSaveModel(BoosterHandle handle, diff --git a/src/common/io.cc b/src/common/io.cc index db1624b95..1e15c4173 100644 --- a/src/common/io.cc +++ b/src/common/io.cc @@ -28,6 +28,7 @@ #include // for size_t #include // for int32_t, uint32_t #include // for memcpy +#include // for filesystem #include // for ifstream #include // for distance #include // for numeric_limits @@ -153,7 +154,7 @@ std::string LoadSequentialFile(std::string uri, bool stream) { // Open in binary mode so that correct file size can be computed with // seekg(). This accommodates Windows platform: // https://docs.microsoft.com/en-us/cpp/standard-library/basic-istream-class?view=vs-2019#seekg - std::ifstream ifs(uri, std::ios_base::binary | std::ios_base::in); + std::ifstream ifs(std::filesystem::u8path(uri), std::ios_base::binary | std::ios_base::in); if (!ifs) { // https://stackoverflow.com/a/17338934 OpenErr(); diff --git a/tests/ci_build/build_python_wheels.sh b/tests/ci_build/build_python_wheels.sh index 205b3b695..435b50822 100644 --- a/tests/ci_build/build_python_wheels.sh +++ b/tests/ci_build/build_python_wheels.sh @@ -35,7 +35,7 @@ if [[ "$platform_id" == macosx_* ]]; then # MacOS, Intel wheel_tag=macosx_10_15_x86_64.macosx_11_0_x86_64.macosx_12_0_x86_64 cpython_ver=38 - export MACOSX_DEPLOYMENT_TARGET=10.13 + export MACOSX_DEPLOYMENT_TARGET=10.15 #OPENMP_URL="https://anaconda.org/conda-forge/llvm-openmp/11.1.0/download/osx-64/llvm-openmp-11.1.0-hda6cdc1_1.tar.bz2" OPENMP_URL="https://xgboost-ci-jenkins-artifacts.s3.us-west-2.amazonaws.com/llvm-openmp-11.1.0-hda6cdc1_1-osx-64.tar.bz2" else diff --git a/tests/cpp/c_api/test_c_api.cc b/tests/cpp/c_api/test_c_api.cc index 4e1b342ae..205e5f561 100644 --- a/tests/cpp/c_api/test_c_api.cc +++ b/tests/cpp/c_api/test_c_api.cc @@ -8,10 +8,11 @@ #include #include -#include // for array -#include // std::size_t -#include // std::numeric_limits -#include // std::string +#include // for array +#include // std::size_t +#include // std::filesystem +#include // std::numeric_limits +#include // std::string #include #include "../../../src/c_api/c_api_error.h" @@ -162,7 +163,7 @@ TEST(CAPI, ConfigIO) { TEST(CAPI, JsonModelIO) { size_t constexpr kRows = 10; size_t constexpr kCols = 10; - dmlc::TemporaryDirectory tempdir; + auto tempdir = std::filesystem::temp_directory_path(); auto p_dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(); 
std::vector> mat {p_dmat}; @@ -178,19 +179,19 @@ TEST(CAPI, JsonModelIO) { learner->UpdateOneIter(0, p_dmat); BoosterHandle handle = learner.get(); - std::string modelfile_0 = tempdir.path + "/model_0.json"; - XGBoosterSaveModel(handle, modelfile_0.c_str()); - XGBoosterLoadModel(handle, modelfile_0.c_str()); + auto modelfile_0 = tempdir / std::filesystem::u8path(u8"모델_0.json"); + XGBoosterSaveModel(handle, modelfile_0.u8string().c_str()); + XGBoosterLoadModel(handle, modelfile_0.u8string().c_str()); bst_ulong num_feature {0}; ASSERT_EQ(XGBoosterGetNumFeature(handle, &num_feature), 0); ASSERT_EQ(num_feature, kCols); - std::string modelfile_1 = tempdir.path + "/model_1.json"; - XGBoosterSaveModel(handle, modelfile_1.c_str()); + auto modelfile_1 = tempdir / "model_1.json"; + XGBoosterSaveModel(handle, modelfile_1.u8string().c_str()); - auto model_str_0 = common::LoadSequentialFile(modelfile_0); - auto model_str_1 = common::LoadSequentialFile(modelfile_1); + auto model_str_0 = common::LoadSequentialFile(modelfile_0.u8string()); + auto model_str_1 = common::LoadSequentialFile(modelfile_1.u8string()); ASSERT_EQ(model_str_0.front(), '{'); ASSERT_EQ(model_str_0, model_str_1); diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py index e512e4bc6..b99351c7f 100644 --- a/tests/python/test_basic.py +++ b/tests/python/test_basic.py @@ -1,5 +1,6 @@ import json import os +import pathlib import tempfile from pathlib import Path @@ -167,6 +168,17 @@ class TestBasic: with pytest.raises(xgb.core.XGBoostError): xgb.Booster(model_file=u'不正なパス') + @pytest.mark.parametrize("path", ["모델.ubj", "がうる・ぐら.json"], ids=["path-0", "path-1"]) + def test_unicode_path(self, tmpdir, path): + model_path = pathlib.Path(tmpdir) / path + dtrain, _ = tm.load_agaricus(__file__) + param = {"max_depth": 2, "eta": 1, "objective": "binary:logistic"} + bst = xgb.train(param, dtrain, num_boost_round=2) + bst.save_model(model_path) + + bst2 = xgb.Booster(model_file=model_path) + assert bst.get_dump(dump_format="text") == bst2.get_dump(dump_format="text") + def test_dmatrix_numpy_init_omp(self): rows = [1000, 11326, 15000] From c1b2cff874aca6aef3d8322255acc6721c95aa51 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 9 Aug 2023 03:02:45 +0800 Subject: [PATCH 071/136] [CI] Check compiler warnings. 
(#9444)
---
 plugin/federated/federated_server.cc | 12 ++++++------
 tests/ci_build/build_via_cmake.sh    |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/plugin/federated/federated_server.cc b/plugin/federated/federated_server.cc
index b16d34780..c50bf1f35 100644
--- a/plugin/federated/federated_server.cc
+++ b/plugin/federated/federated_server.cc
@@ -14,15 +14,15 @@ namespace xgboost {
 namespace federated {

-grpc::Status FederatedService::Allgather(grpc::ServerContext* context,
-                                         AllgatherRequest const* request, AllgatherReply* reply) {
+grpc::Status FederatedService::Allgather(grpc::ServerContext*, AllgatherRequest const* request,
+                                         AllgatherReply* reply) {
   handler_.Allgather(request->send_buffer().data(), request->send_buffer().size(),
                      reply->mutable_receive_buffer(), request->sequence_number(), request->rank());
   return grpc::Status::OK;
 }

-grpc::Status FederatedService::Allreduce(grpc::ServerContext* context,
-                                         AllreduceRequest const* request, AllreduceReply* reply) {
+grpc::Status FederatedService::Allreduce(grpc::ServerContext*, AllreduceRequest const* request,
+                                         AllreduceReply* reply) {
   handler_.Allreduce(request->send_buffer().data(), request->send_buffer().size(),
                      reply->mutable_receive_buffer(), request->sequence_number(), request->rank(),
                      static_cast(request->data_type()),
@@ -30,8 +30,8 @@ grpc::Status FederatedService::Allreduce(grpc::ServerContext* context,
   return grpc::Status::OK;
 }

-grpc::Status FederatedService::Broadcast(grpc::ServerContext* context,
-                                         BroadcastRequest const* request, BroadcastReply* reply) {
+grpc::Status FederatedService::Broadcast(grpc::ServerContext*, BroadcastRequest const* request,
+                                         BroadcastReply* reply) {
   handler_.Broadcast(request->send_buffer().data(), request->send_buffer().size(),
                      reply->mutable_receive_buffer(), request->sequence_number(), request->rank(),
                      request->root());
diff --git a/tests/ci_build/build_via_cmake.sh b/tests/ci_build/build_via_cmake.sh
index adf3cf984..ef5b8dc0e 100755
--- a/tests/ci_build/build_via_cmake.sh
+++ b/tests/ci_build/build_via_cmake.sh
@@ -24,7 +24,7 @@ fi
 rm -rf build
 mkdir build
 cd build
-cmake .. ${cmake_args} -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_ALL_WARNINGS=ON -GNinja ${cmake_prefix_flag} -DHIDE_CXX_SYMBOLS=ON
+cmake .. ${cmake_args} -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_ALL_WARNINGS=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -GNinja ${cmake_prefix_flag} -DHIDE_CXX_SYMBOLS=ON
 ninja clean
 time ninja -v
 cd ..
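The federated_server.cc change above is the piece that makes `-DCMAKE_COMPILE_WARNING_AS_ERROR=ON` viable: once warnings are fatal, every `-Wunused-parameter` hit breaks the build, and the standard fix is to drop the name of the unused parameter so the override still matches the interface. A minimal sketch of the idiom, assuming nothing beyond the standard library (`Handler` and `EchoHandler` are illustrative names, not from the patch):

```cpp
#include <iostream>

// An interface can force a signature whose arguments are not all needed by
// every implementation.
struct Handler {
  virtual int Handle(int request, int context) = 0;
  virtual ~Handler() = default;
};

struct EchoHandler : public Handler {
  // Leaving the second parameter unnamed keeps the override conforming to
  // the base signature while silencing -Wunused-parameter.
  int Handle(int request, int /*context*/) override { return request; }
};

int main() {
  EchoHandler handler;
  std::cout << handler.Handle(42, 0) << "\n";  // prints 42
  return 0;
}
```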
From 819098a48f2e58a92894bd0e39f191ea4e199323 Mon Sep 17 00:00:00 2001
From: Philip Hyunsu Cho
Date: Tue, 8 Aug 2023 21:29:19 -0700
Subject: [PATCH 072/136] [R] Handle UTF-8 paths on Windows (#9448)

---
 R-package/R/xgb.Booster.R               |  2 +-
 R-package/R/xgb.save.R                  |  2 +-
 R-package/tests/testthat/test_unicode.R | 21 +++++++++++++++++++++
 3 files changed, 23 insertions(+), 2 deletions(-)
 create mode 100644 R-package/tests/testthat/test_unicode.R

diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R
index 080067039..6a53577e9 100644
--- a/R-package/R/xgb.Booster.R
+++ b/R-package/R/xgb.Booster.R
@@ -12,7 +12,7 @@ xgb.Booster.handle <- function(params = list(), cachelist = list(),
     ## A filename
     handle <- .Call(XGBoosterCreate_R, cachelist)
     modelfile <- path.expand(modelfile)
-    .Call(XGBoosterLoadModel_R, handle, modelfile[1])
+    .Call(XGBoosterLoadModel_R, handle, enc2utf8(modelfile[1]))
     class(handle) <- "xgb.Booster.handle"
     if (length(params) > 0) {
       xgb.parameters(handle) <- params
diff --git a/R-package/R/xgb.save.R b/R-package/R/xgb.save.R
index 42ecb4153..14be0f065 100644
--- a/R-package/R/xgb.save.R
+++ b/R-package/R/xgb.save.R
@@ -43,6 +43,6 @@ xgb.save <- function(model, fname) {
   }
   model <- xgb.Booster.complete(model, saveraw = FALSE)
   fname <- path.expand(fname)
-  .Call(XGBoosterSaveModel_R, model$handle, fname[1])
+  .Call(XGBoosterSaveModel_R, model$handle, enc2utf8(fname[1]))
   return(TRUE)
 }
diff --git a/R-package/tests/testthat/test_unicode.R b/R-package/tests/testthat/test_unicode.R
new file mode 100644
index 000000000..cac544ef9
--- /dev/null
+++ b/R-package/tests/testthat/test_unicode.R
@@ -0,0 +1,21 @@
+context("Test Unicode handling")
+
+data(agaricus.train, package = 'xgboost')
+data(agaricus.test, package = 'xgboost')
+train <- agaricus.train
+test <- agaricus.test
+set.seed(1994)
+
+test_that("Can save and load models with Unicode paths", {
+  nrounds <- 2
+  bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
+                 eta = 1, nthread = 2, nrounds = nrounds, objective = "binary:logistic",
+                 eval_metric = "error")
+  tmpdir <- tempdir()
+  lapply(c("모델.json", "がうる・ぐら.json", "类继承.ubj"), function(x) {
+    path <- file.path(tmpdir, x)
+    xgb.save(bst, path)
+    bst2 <- xgb.load(path)
+    expect_equal(predict(bst, test$data), predict(bst2, test$data))
+  })
+})

From f05294a6f2e7d065ce7d479b9ad503e677f1b981 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Wed, 9 Aug 2023 15:34:45 +0800
Subject: [PATCH 073/136] Fix clang warnings. (#9447)

- static function in header. (which is marked as unused due to translation
  unit visibility).
- Implicit copy operator is deprecated.
- Unused lambda capture.
- Moving a temporary variable prevents copy elision.
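Of the warnings listed above, the copy-elision one is the subtlest: wrapping a temporary in `std::move()` forces a move where the compiler could have elided the copy entirely, which clang reports as `-Wpessimizing-move`. A minimal sketch of the pattern, assuming only the standard library (the actual fix is in the `gpu_predictor.cu` hunk below):

```cpp
#include <utility>
#include <vector>

std::vector<int> MakeVector() { return std::vector<int>(100, 0); }

int main() {
  // Pessimizing: the explicit std::move() blocks copy elision, so a move is
  // performed where none was needed; clang flags this as -Wpessimizing-move.
  std::vector<int> pessimized = std::move(MakeVector());

  // Preferred: initialize from the temporary directly and let the compiler
  // elide the copy/move altogether.
  std::vector<int> elided = MakeVector();

  return pessimized.size() == elided.size() ? 0 : 1;
}
```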
--- include/xgboost/base.h | 18 ++++-------------- src/common/math.h | 6 ------ src/data/iterative_dmatrix.cu | 2 +- src/metric/rank_metric.cc | 3 ++- src/predictor/gpu_predictor.cu | 12 +++++------- 5 files changed, 12 insertions(+), 29 deletions(-) diff --git a/include/xgboost/base.h b/include/xgboost/base.h index a5edadb6c..f02d75cdc 100644 --- a/include/xgboost/base.h +++ b/include/xgboost/base.h @@ -271,10 +271,11 @@ class GradientPairInt64 { GradientPairInt64() = default; // Copy constructor if of same value type, marked as default to be trivially_copyable - GradientPairInt64(const GradientPairInt64 &g) = default; + GradientPairInt64(GradientPairInt64 const &g) = default; + GradientPairInt64 &operator=(GradientPairInt64 const &g) = default; - XGBOOST_DEVICE T GetQuantisedGrad() const { return grad_; } - XGBOOST_DEVICE T GetQuantisedHess() const { return hess_; } + XGBOOST_DEVICE [[nodiscard]] T GetQuantisedGrad() const { return grad_; } + XGBOOST_DEVICE [[nodiscard]] T GetQuantisedHess() const { return hess_; } XGBOOST_DEVICE GradientPairInt64 &operator+=(const GradientPairInt64 &rhs) { grad_ += rhs.grad_; @@ -323,17 +324,6 @@ using omp_ulong = dmlc::omp_ulong; // NOLINT using bst_omp_uint = dmlc::omp_uint; // NOLINT /*! \brief Type used for representing version number in binary form.*/ using XGBoostVersionT = int32_t; - -/*! - * \brief define compatible keywords in g++ - * Used to support g++-4.6 and g++4.7 - */ -#if DMLC_USE_CXX11 && defined(__GNUC__) && !defined(__clang_version__) -#if __GNUC__ == 4 && __GNUC_MINOR__ < 8 -#define override -#define final -#endif // __GNUC__ == 4 && __GNUC_MINOR__ < 8 -#endif // DMLC_USE_CXX11 && defined(__GNUC__) && !defined(__clang_version__) } // namespace xgboost #endif // XGBOOST_BASE_H_ diff --git a/src/common/math.h b/src/common/math.h index c4d794b5d..be5ff7abd 100644 --- a/src/common/math.h +++ b/src/common/math.h @@ -134,12 +134,6 @@ inline float LogSum(Iterator begin, Iterator end) { return mx + std::log(sum); } -// comparator functions for sorting pairs in descending order -inline static bool CmpFirst(const std::pair &a, - const std::pair &b) { - return a.first > b.first; -} - // Redefined here to workaround a VC bug that doesn't support overloading for integer // types. template diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index 1e74cb23c..cf34ca61d 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -114,7 +114,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, this->info_.num_row_ = accumulated_rows; this->info_.num_nonzero_ = nnz; - auto init_page = [this, &proxy, &cuts, row_stride, accumulated_rows, get_device]() { + auto init_page = [this, &cuts, row_stride, accumulated_rows, get_device]() { if (!ellpack_) { // Should be put inside the while loop to protect against empty batch. In // that case device id is invalid. 
diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc index dd9adc017..8df6e585f 100644 --- a/src/metric/rank_metric.cc +++ b/src/metric/rank_metric.cc @@ -68,7 +68,8 @@ struct EvalAMS : public MetricNoCache { const auto &h_preds = preds.ConstHostVector(); common::ParallelFor(ndata, ctx_->Threads(), [&](bst_omp_uint i) { rec[i] = std::make_pair(h_preds[i], i); }); - common::Sort(ctx_, rec.begin(), rec.end(), common::CmpFirst); + common::Sort(ctx_, rec.begin(), rec.end(), + [](auto const& l, auto const& r) { return l.first > r.first; }); auto ntop = static_cast(ratio_ * ndata); if (ntop == 0) ntop = ndata; const double br = 10.0; diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 578fda180..70a5c02d5 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -344,7 +344,7 @@ class DeviceModel { dh::safe_cuda(cudaSetDevice(gpu_id)); // Copy decision trees to device - tree_segments = std::move(HostDeviceVector({}, gpu_id)); + tree_segments = HostDeviceVector({}, gpu_id); auto& h_tree_segments = tree_segments.HostVector(); h_tree_segments.reserve((tree_end - tree_begin) + 1); size_t sum = 0; @@ -354,10 +354,8 @@ class DeviceModel { h_tree_segments.push_back(sum); } - nodes = std::move(HostDeviceVector(h_tree_segments.back(), RegTree::Node(), - gpu_id)); - stats = std::move(HostDeviceVector(h_tree_segments.back(), - RTreeNodeStat(), gpu_id)); + nodes = HostDeviceVector(h_tree_segments.back(), RegTree::Node(), gpu_id); + stats = HostDeviceVector(h_tree_segments.back(), RTreeNodeStat(), gpu_id); auto d_nodes = nodes.DevicePointer(); auto d_stats = stats.DevicePointer(); for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { @@ -371,7 +369,7 @@ class DeviceModel { sizeof(RTreeNodeStat) * src_stats.size(), cudaMemcpyDefault)); } - tree_group = std::move(HostDeviceVector(model.tree_info.size(), 0, gpu_id)); + tree_group = HostDeviceVector(model.tree_info.size(), 0, gpu_id); auto& h_tree_group = tree_group.HostVector(); std::memcpy(h_tree_group.data(), model.tree_info.data(), sizeof(int) * model.tree_info.size()); @@ -435,7 +433,7 @@ struct ShapSplitCondition { bool is_missing_branch; // Does this instance flow down this path? - XGBOOST_DEVICE bool EvaluateSplit(float x) const { + [[nodiscard]] XGBOOST_DEVICE bool EvaluateSplit(float x) const { // is nan if (isnan(x)) { return is_missing_branch; From 7f854848d3df0709be360122ea0c78d5dc7c6cbf Mon Sep 17 00:00:00 2001 From: joshbrowning2358 Date: Wed, 9 Aug 2023 11:04:28 +0200 Subject: [PATCH 074/136] Update R docs based on deprecated parameters/behaviour (#9437) --- R-package/vignettes/discoverYourData.Rmd | 105 ++-- doc/R-package/discoverYourData.md | 646 ++++++++++++----------- 2 files changed, 361 insertions(+), 390 deletions(-) diff --git a/R-package/vignettes/discoverYourData.Rmd b/R-package/vignettes/discoverYourData.Rmd index c939232a1..8b9e2e2e3 100644 --- a/R-package/vignettes/discoverYourData.Rmd +++ b/R-package/vignettes/discoverYourData.Rmd @@ -51,24 +51,24 @@ A *categorical* variable has a fixed number of different values. For instance, i > > Type `?factor` in the console for more information. -To answer the question above we will convert *categorical* variables to `numeric` one. +To answer the question above we will convert *categorical* variables to `numeric` ones. 
### Conversion from categorical to numeric variables #### Looking at the raw data -In this Vignette we will see how to transform a *dense* `data.frame` (*dense* = few zeroes in the matrix) with *categorical* variables to a very *sparse* matrix (*sparse* = lots of zero in the matrix) of `numeric` features. ++In this Vignette we will see how to transform a *dense* `data.frame` (*dense* = the majority of the matrix is non-zero) with *categorical* variables to a very *sparse* matrix (*sparse* = lots of zero entries in the matrix) of `numeric` features. The method we are going to see is usually called [one-hot encoding](https://en.wikipedia.org/wiki/One-hot). -The first step is to load `Arthritis` dataset in memory and wrap it with `data.table` package. +The first step is to load the `Arthritis` dataset in memory and wrap it with the `data.table` package. ```{r, results='hide'} data(Arthritis) df <- data.table(Arthritis, keep.rownames = FALSE) ``` -> `data.table` is 100% compliant with **R** `data.frame` but its syntax is more consistent and its performance for large dataset is [best in class](https://stackoverflow.com/questions/21435339/data-table-vs-dplyr-can-one-do-something-well-the-other-cant-or-does-poorly) (`dplyr` from **R** and `Pandas` from **Python** [included](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping)). Some parts of **XGBoost** **R** package use `data.table`. +> `data.table` is 100% compliant with **R** `data.frame` but its syntax is more consistent and its performance for large dataset is [best in class](https://stackoverflow.com/questions/21435339/data-table-vs-dplyr-can-one-do-something-well-the-other-cant-or-does-poorly) (`dplyr` from **R** and `Pandas` from **Python** [included](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping)). Some parts of **XGBoost's** **R** package use `data.table`. The first thing we want to do is to have a look to the first few lines of the `data.table`: @@ -95,19 +95,19 @@ We will add some new *categorical* features to see if it helps. ##### Grouping per 10 years -For the first feature we create groups of age by rounding the real age. +For the first features we create groups of age by rounding the real age. -Note that we transform it to `factor` so the algorithm treat these age groups as independent values. +Note that we transform it to `factor` so the algorithm treats these age groups as independent values. -Therefore, 20 is not closer to 30 than 60. To make it short, the distance between ages is lost in this transformation. +Therefore, 20 is not closer to 30 than 60. In other words, the distance between ages is lost in this transformation. ```{r} head(df[, AgeDiscret := as.factor(round(Age / 10, 0))]) ``` -##### Random split into two groups +##### Randomly split into two groups -Following is an even stronger simplification of the real age with an arbitrary split at 30 years old. We choose this value **based on nothing**. We will see later if simplifying the information based on arbitrary values is a good strategy (you may already have an idea of how well it will work...). +The following is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value **based on nothing**. We will see later if simplifying the information based on arbitrary values is a good strategy (you may already have an idea of how well it will work...). 
```{r} head(df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]) @@ -119,7 +119,7 @@ These new features are highly correlated to the `Age` feature because they are s For many machine learning algorithms, using correlated features is not a good idea. It may sometimes make prediction less accurate, and most of the time make interpretation of the model almost impossible. GLM, for instance, assumes that the features are uncorrelated. -Fortunately, decision tree algorithms (including boosted trees) are very robust to these features. Therefore we have nothing to do to manage this situation. +Fortunately, decision tree algorithms (including boosted trees) are very robust to these features. Therefore we don't have to do anything to manage this situation. ##### Cleaning data @@ -144,7 +144,7 @@ We will use the [dummy contrast coding](https://stats.oarc.ucla.edu/r/library/r- The purpose is to transform each value of each *categorical* feature into a *binary* feature `{0, 1}`. -For example, the column `Treatment` will be replaced by two columns, `TreatmentPlacebo`, and `TreatmentTreated`. Each of them will be *binary*. Therefore, an observation which has the value `Placebo` in column `Treatment` before the transformation will have after the transformation the value `1` in the new column `TreatmentPlacebo` and the value `0` in the new column `TreatmentTreated`. The column `TreatmentPlacebo` will disappear during the contrast encoding, as it would be absorbed into a common constant intercept column. +For example, the column `Treatment` will be replaced by two columns, `TreatmentPlacebo`, and `TreatmentTreated`. Each of them will be *binary*. Therefore, an observation which has the value `Placebo` in column `Treatment` before the transformation will have the value `1` in the new column `TreatmentPlacebo` and the value `0` in the new column `TreatmentTreated` after the transformation. The column `TreatmentPlacebo` will disappear during the contrast encoding, as it would be absorbed into a common constant intercept column. Column `Improved` is excluded because it will be our `label` column, the one we want to predict. @@ -176,13 +176,9 @@ bst <- xgboost(data = sparse_matrix, label = output_vector, max_depth = 4, ``` -You can see some `train-error: 0.XXXXX` lines followed by a number. It decreases. Each line shows how well the model explains your data. Lower is better. +You can see some `train-logloss: 0.XXXXX` lines followed by a number. It decreases. Each line shows how well the model explains the data. Lower is better. -A small value for training error may be a symptom of [overfitting](https://en.wikipedia.org/wiki/Overfitting), meaning the model will not accurately predict the future values. - -> Here you can see the numbers decrease until line 7 and then increase. -> -> It probably means we are overfitting. To fix that I should reduce the number of rounds to `nrounds = 4`. I will let things like that because I don't really care for the purpose of this example :-) +A small value for training error may be a symptom of [overfitting](https://en.wikipedia.org/wiki/Overfitting), meaning the model will not accurately predict unseen values. Feature importance ------------------ @@ -199,64 +195,35 @@ importance <- xgb.importance(feature_names = colnames(sparse_matrix), model = bs head(importance) ``` -> The column `Gain` provide the information we are looking for. +> The column `Gain` provides the information we are looking for. > > As you can see, features are classified by `Gain`. 
-`Gain` is the improvement in accuracy brought by a feature to the branches it is on. The idea is that before adding a new split on a feature X to the branch there was some wrongly classified elements, after adding the split on this feature, there are two new branches, and each of these branch is more accurate (one branch saying if your observation is on this branch then it should be classified as `1`, and the other branch saying the exact opposite). +`Gain` is the improvement in accuracy brought by a feature to the branches it is on. The idea is that before adding a new split on a feature X to the branch there were some wrongly classified elements; after adding the split on this feature, there are two new branches, and each of these branches is more accurate (one branch saying if your observation is on this branch then it should be classified as `1`, and the other branch saying the exact opposite). -`Cover` measures the relative quantity of observations concerned by a feature. +`Cover` is related to the second order derivative (or Hessian) of the loss function with respect to a particular variable; thus, a large value indicates a variable has a large potential impact on the loss function and so is important. `Frequency` is a simpler way to measure the `Gain`. It just counts the number of times a feature is used in all generated trees. You should not use it (unless you know why you want to use it). -#### Improvement in the interpretability of feature importance data.table - -We can go deeper in the analysis of the model. In the `data.table` above, we have discovered which features counts to predict if the illness will go or not. But we don't yet know the role of these features. For instance, one of the question we may want to answer would be: does receiving a placebo treatment helps to recover from the illness? - -One simple solution is to count the co-occurrences of a feature and a class of the classification. - -For that purpose we will execute the same function as above but using two more parameters, `data` and `label`. - -```{r} -importanceRaw <- xgb.importance(feature_names = colnames(sparse_matrix), model = bst, data = sparse_matrix, label = output_vector) - -# Cleaning for better display -importanceClean <- importanceRaw[, `:=`(Cover = NULL, Frequency = NULL)] - -head(importanceClean) -``` - -> In the table above we have removed two not needed columns and select only the first lines. - -First thing you notice is the new column `Split`. It is the split applied to the feature on a branch of one of the tree. Each split is present, therefore a feature can appear several times in this table. Here we can see the feature `Age` is used several times with different splits. - -How the split is applied to count the co-occurrences? It is always `<`. For instance, in the second line, we measure the number of persons under 61.5 years with the illness gone after the treatment. - -The two other new columns are `RealCover` and `RealCover %`. In the first column it measures the number of observations in the dataset where the split is respected and the label marked as `1`. The second column is the percentage of the whole population that `RealCover` represents. - -Therefore, according to our findings, getting a placebo doesn't seem to help but being younger than 61 years may help (seems logic). - -> You may wonder how to interpret the `< 1.00001` on the first line. 
Basically, in a sparse `Matrix`, there is no `0`, therefore, looking for one hot-encoded categorical observations validating the rule `< 1.00001` is like just looking for `1` for this feature. - ### Plotting the feature importance - All these things are nice, but it would be even better to plot the results. ```{r, fig.width=8, fig.height=5, fig.align='center'} xgb.plot.importance(importance_matrix = importance) ``` -Feature have automatically been divided in 2 clusters: the interesting features... and the others. +Running this line of code, you should get a bar chart showing the importance of the 6 features (containing the same data as the output we saw earlier, but displaying it visually for easier consumption). Note that `xgb.ggplot.importance` is also available for all the ggplot2 fans! > Depending of the dataset and the learning parameters you may have more than two clusters. Default value is to limit them to `10`, but you can increase this limit. Look at the function documentation for more information. According to the plot above, the most important features in this dataset to predict if the treatment will work are : -* the Age ; -* having received a placebo or not ; -* the sex is third but already included in the not interesting features group ; -* then we see our generated features (AgeDiscret). We can see that their contribution is very low. +* An individual's age; +* Having received a placebo or not; +* Gender; +* Our generated feature AgeDiscret. We can see that its contribution is very low. + ### Do these results make sense? @@ -270,53 +237,53 @@ c2 <- chisq.test(df$Age, output_vector) print(c2) ``` -Pearson correlation between Age and illness disappearing is **`r round(c2$statistic, 2 )`**. +The Pearson correlation between Age and illness disappearing is **`r round(c2$statistic, 2 )`**. ```{r, warning=FALSE, message=FALSE} c2 <- chisq.test(df$AgeDiscret, output_vector) print(c2) ``` -Our first simplification of Age gives a Pearson correlation is **`r round(c2$statistic, 2)`**. +Our first simplification of Age gives a Pearson correlation of **`r round(c2$statistic, 2)`**. ```{r, warning=FALSE, message=FALSE} c2 <- chisq.test(df$AgeCat, output_vector) print(c2) ``` -The perfectly random split I did between young and old at 30 years old have a low correlation of **`r round(c2$statistic, 2)`**. It's a result we may expect as may be in my mind > 30 years is being old (I am 32 and starting feeling old, this may explain that), but for the illness we are studying, the age to be vulnerable is not the same. +The perfectly random split we did between young and old at 30 years old has a low correlation of **2.36**. This suggests that, for the particular illness we are studying, the age at which someone is vulnerable to this disease is likely very different from 30. -Morality: don't let your *gut* lower the quality of your model. +Moral of the story: don't let your *gut* lower the quality of your model. -In *data science* expression, there is the word *science* :-) +In *data science*, there is the word *science* :-) Conclusion ---------- As you can see, in general *destroying information by simplifying it won't improve your model*. **Chi2** just demonstrates that. -But in more complex cases, creating a new feature based on existing one which makes link with the outcome more obvious may help the algorithm and improve the model. +But in more complex cases, creating a new feature from an existing one may help the algorithm and improve the model. 
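As a rough sketch of what such a feature could look like on this dataset (the `TreatedYoung` column below is hypothetical and only illustrates the pattern, it is not part of the study):

```r
# Hypothetical engineered feature: combine two existing columns into one that
# may be linked to the outcome more directly than either column alone.
df[, TreatedYoung := as.factor(Treatment == "Treated" & Age < 40)]
```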
-The case studied here is not enough complex to show that. Check [Kaggle website](http://www.kaggle.com/) for some challenging datasets. However it's almost always worse when you add some arbitrary rules.
+The case studied here is not complex enough to show that. Check [Kaggle website](https://www.kaggle.com/) for some challenging datasets.

-Moreover, you can notice that even if we have added some not useful new features highly correlated with other features, the boosting tree algorithm have been able to choose the best one, which in this case is the Age.
+Moreover, you can see that even if we have added some new features which are not very useful/highly correlated with other features, the boosting tree algorithm was still able to choose the best one (which in this case is the Age).

-Linear model may not be that smart in this scenario.
+Linear models may not perform as well.

Special Note: What about Random Forests™?
-----------------------------------------

-As you may know, [Random Forests](https://en.wikipedia.org/wiki/Random_forest) algorithm is cousin with boosting and both are part of the [ensemble learning](https://en.wikipedia.org/wiki/Ensemble_learning) family.
+As you may know, the [Random Forests](https://en.wikipedia.org/wiki/Random_forest) algorithm is cousin with boosting and both are part of the [ensemble learning](https://en.wikipedia.org/wiki/Ensemble_learning) family.

-Both trains several decision trees for one dataset. The *main* difference is that in Random Forests, trees are independent and in boosting, the tree `N+1` focus its learning on the loss (<=> what has not been well modeled by the tree `N`).
+Both train several decision trees for one dataset. The *main* difference is that in Random Forests, trees are independent and in boosting, the `N+1`-st tree focuses its learning on the loss (<=> what has not been well modeled by the tree `N`).

-This difference have an impact on a corner case in feature importance analysis: the *correlated features*.
+This difference can have an impact on an edge case in feature importance analysis: *correlated features*.

Imagine two features perfectly correlated, feature `A` and feature `B`. For one specific tree, if the algorithm needs one of them, it will choose randomly (true in both boosting and Random Forests).

-However, in Random Forests this random choice will be done for each tree, because each tree is independent from the others. Therefore, approximatively, depending of your parameters, 50% of the trees will choose feature `A` and the other 50% will choose feature `B`. So the *importance* of the information contained in `A` and `B` (which is the same, because they are perfectly correlated) is diluted in `A` and `B`. So you won't easily know this information is important to predict what you want to predict! It is even worse when you have 10 correlated features...
+However, in Random Forests this random choice will be done for each tree, because each tree is independent from the others. Therefore, approximately (and depending on your parameters) 50% of the trees will choose feature `A` and the other 50% will choose feature `B`. So the *importance* of the information contained in `A` and `B` (which is the same, because they are perfectly correlated) is diluted in `A` and `B`. So you won't easily know this information is important to predict what you want to predict! It is even worse when you have 10 correlated features...
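To see the dilution effect for yourself, here is a small hedged sketch (it assumes the `sparse_matrix` and `output_vector` objects created earlier): duplicate one column so that two features are perfectly correlated, then compare how the importance is distributed.

```r
# Make `Age` and `AgeCopy` perfectly correlated twins.
X2 <- cbind(sparse_matrix, sparse_matrix[, "Age", drop = FALSE])
colnames(X2)[ncol(X2)] <- "AgeCopy"

bst2 <- xgboost(data = X2, label = output_vector, max_depth = 4, eta = 1,
                nthread = 2, nrounds = 10, objective = "binary:logistic")

# Compare how Gain is now shared between `Age` and `AgeCopy`.
xgb.importance(model = bst2)
```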
-In boosting, when a specific link between feature and outcome have been learned by the algorithm, it will try to not refocus on it (in theory it is what happens, reality is not always that simple). Therefore, all the importance will be on feature `A` or on feature `B` (but not both). You will know that one feature have an important role in the link between the observations and the label. It is still up to you to search for the correlated features to the one detected as important if you need to know all of them. +In boosting, when a specific link between feature and outcome have been learned by the algorithm, it will try to not refocus on it (in theory it is what happens, reality is not always that simple). Therefore, all the importance will be on feature `A` or on feature `B` (but not both). You will know that one feature has an important role in the link between the observations and the label. It is still up to you to search for the correlated features to the one detected as important if you need to know all of them. If you want to try Random Forests algorithm, you can tweak XGBoost parameters! diff --git a/doc/R-package/discoverYourData.md b/doc/R-package/discoverYourData.md index 9233546df..1c5136459 100644 --- a/doc/R-package/discoverYourData.md +++ b/doc/R-package/discoverYourData.md @@ -1,102 +1,103 @@ +# Understand your dataset with XGBoost -Understand your dataset with XGBoost -==================================== +## Introduction -Introduction ------------- +The purpose of this vignette is to show you how to use **XGBoost** to +discover and understand your own dataset better. -The purpose of this Vignette is to show you how to use **XGBoost** to discover and understand your own dataset better. - -This Vignette is not about predicting anything (see [XGBoost presentation](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd)). We will explain how to use **XGBoost** to highlight the *link* between the *features* of your data and the *outcome*. +This vignette is not about predicting anything (see [XGBoost +presentation](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd)). +We will explain how to use **XGBoost** to highlight the *link* between +the *features* of your data and the *outcome*. Package loading: - -```r -require(xgboost) -require(Matrix) -require(data.table) -if (!require('vcd')) install.packages('vcd') -``` + require(xgboost) + require(Matrix) + require(data.table) + if (!require('vcd')) { + install.packages('vcd') + } > **VCD** package is used for one of its embedded dataset only. -Preparation of the dataset --------------------------- - -### Numeric VS categorical variables +## Preparation of the dataset +### Numeric v.s. categorical variables **XGBoost** manages only `numeric` vectors. What to do when you have *categorical* data? -A *categorical* variable has a fixed number of different values. For instance, if a variable called *Colour* can have only one of these three values, *red*, *blue* or *green*, then *Colour* is a *categorical* variable. +A *categorical* variable has a fixed number of different values. For +instance, if a variable called *Colour* can have only one of these three +values, *red*, *blue* or *green*, then *Colour* is a *categorical* +variable. > In **R**, a *categorical* variable is called `factor`. > > Type `?factor` in the console for more information. -To answer the question above we will convert *categorical* variables to `numeric` one. 
+To answer the question above we will convert *categorical* variables to +`numeric` ones. ### Conversion from categorical to numeric variables #### Looking at the raw data -In this Vignette we will see how to transform a *dense* `data.frame` (*dense* = few zeroes in the matrix) with *categorical* variables to a very *sparse* matrix (*sparse* = lots of zero in the matrix) of `numeric` features. ++In this Vignette we will see how to transform a *dense* `data.frame` +(*dense* = the majority of the matrix is non-zero) with *categorical* +variables to a very *sparse* matrix (*sparse* = lots of zero entries in +the matrix) of `numeric` features. -The method we are going to see is usually called [one-hot encoding](http://en.wikipedia.org/wiki/One-hot). +The method we are going to see is usually called [one-hot +encoding](https://en.wikipedia.org/wiki/One-hot). -The first step is to load `Arthritis` dataset in memory and wrap it with `data.table` package. +The first step is to load the `Arthritis` dataset in memory and wrap it +with the `data.table` package. + data(Arthritis) + df <- data.table(Arthritis, keep.rownames = FALSE) -```r -data(Arthritis) -df <- data.table(Arthritis, keep.rownames = FALSE) -``` +> `data.table` is 100% compliant with **R** `data.frame` but its syntax +> is more consistent and its performance for large dataset is [best in +> class](https://stackoverflow.com/questions/21435339/data-table-vs-dplyr-can-one-do-something-well-the-other-cant-or-does-poorly) +> (`dplyr` from **R** and `Pandas` from **Python** +> [included](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping)). +> Some parts of **XGBoost’s** **R** package use `data.table`. -> `data.table` is 100% compliant with **R** `data.frame` but its syntax is more consistent and its performance for large dataset is [best in class](http://stackoverflow.com/questions/21435339/data-table-vs-dplyr-can-one-do-something-well-the-other-cant-or-does-poorly) (`dplyr` from **R** and `Pandas` from **Python** [included](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping)). Some parts of **XGBoost** **R** package use `data.table`. +The first thing we want to do is to have a look to the first few lines +of the `data.table`: -The first thing we want to do is to have a look to the first lines of the `data.table`: + head(df) - -```r -head(df) -``` - -``` -## ID Treatment Sex Age Improved -## 1: 57 Treated Male 27 Some -## 2: 46 Treated Male 29 None -## 3: 77 Treated Male 30 None -## 4: 17 Treated Male 32 Marked -## 5: 36 Treated Male 46 Marked -## 6: 23 Treated Male 58 Marked -``` + ## ID Treatment Sex Age Improved + ## 1: 57 Treated Male 27 Some + ## 2: 46 Treated Male 29 None + ## 3: 77 Treated Male 30 None + ## 4: 17 Treated Male 32 Marked + ## 5: 36 Treated Male 46 Marked + ## 6: 23 Treated Male 58 Marked Now we will check the format of each column. + str(df) -```r -str(df) -``` - -``` -## Classes 'data.table' and 'data.frame': 84 obs. of 5 variables: -## $ ID : int 57 46 77 17 36 23 75 39 33 55 ... -## $ Treatment: Factor w/ 2 levels "Placebo","Treated": 2 2 2 2 2 2 2 2 2 2 ... -## $ Sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ... -## $ Age : int 27 29 30 32 46 58 59 59 63 63 ... -## $ Improved : Ord.factor w/ 3 levels "None"<"Some"<..: 2 1 1 3 3 3 1 3 1 1 ... -## - attr(*, ".internal.selfref")= -``` + ## Classes 'data.table' and 'data.frame': 84 obs. of 5 variables: + ## $ ID : int 57 46 77 17 36 23 75 39 33 55 ... 
+ ## $ Treatment: Factor w/ 2 levels "Placebo","Treated": 2 2 2 2 2 2 2 2 2 2 ... + ## $ Sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ... + ## $ Age : int 27 29 30 32 46 58 59 59 63 63 ... + ## $ Improved : Ord.factor w/ 3 levels "None"<"Some"<..: 2 1 1 3 3 3 1 3 1 1 ... + ## - attr(*, ".internal.selfref")= 2 columns have `factor` type, one has `ordinal` type. > `ordinal` variable : > -> * can take a limited number of values (like `factor`) ; -> * these values are ordered (unlike `factor`). Here these ordered values are: `Marked > Some > None` +> - can take a limited number of values (like `factor`) ; +> - these values are ordered (unlike `factor`). Here these ordered +> values are: `Marked > Some > None` #### Creation of new features based on old ones @@ -104,368 +105,371 @@ We will add some new *categorical* features to see if it helps. ##### Grouping per 10 years -For the first feature we create groups of age by rounding the real age. +For the first features we create groups of age by rounding the real age. -Note that we transform it to `factor` so the algorithm treat these age groups as independent values. +Note that we transform it to `factor` so the algorithm treats these age +groups as independent values. -Therefore, 20 is not closer to 30 than 60. To make it short, the distance between ages is lost in this transformation. +Therefore, 20 is not closer to 30 than 60. In other words, the distance +between ages is lost in this transformation. + head(df[, AgeDiscret := as.factor(round(Age / 10, 0))]) -```r -head(df[,AgeDiscret := as.factor(round(Age/10,0))]) -``` + ## ID Treatment Sex Age Improved AgeDiscret + ## 1: 57 Treated Male 27 Some 3 + ## 2: 46 Treated Male 29 None 3 + ## 3: 77 Treated Male 30 None 3 + ## 4: 17 Treated Male 32 Marked 3 + ## 5: 36 Treated Male 46 Marked 5 + ## 6: 23 Treated Male 58 Marked 6 -``` -## ID Treatment Sex Age Improved AgeDiscret -## 1: 57 Treated Male 27 Some 3 -## 2: 46 Treated Male 29 None 3 -## 3: 77 Treated Male 30 None 3 -## 4: 17 Treated Male 32 Marked 3 -## 5: 36 Treated Male 46 Marked 5 -## 6: 23 Treated Male 58 Marked 6 -``` +##### Randomly split into two groups -##### Random split in two groups +The following is an even stronger simplification of the real age with an +arbitrary split at 30 years old. I choose this value **based on +nothing**. We will see later if simplifying the information based on +arbitrary values is a good strategy (you may already have an idea of how +well it will work…). -Following is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value **based on nothing**. We will see later if simplifying the information based on arbitrary values is a good strategy (you may already have an idea of how well it will work...). 
+ head(df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]) - -```r -head(df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))]) -``` - -``` -## ID Treatment Sex Age Improved AgeDiscret AgeCat -## 1: 57 Treated Male 27 Some 3 Young -## 2: 46 Treated Male 29 None 3 Young -## 3: 77 Treated Male 30 None 3 Young -## 4: 17 Treated Male 32 Marked 3 Old -## 5: 36 Treated Male 46 Marked 5 Old -## 6: 23 Treated Male 58 Marked 6 Old -``` + ## ID Treatment Sex Age Improved AgeDiscret AgeCat + ## 1: 57 Treated Male 27 Some 3 Young + ## 2: 46 Treated Male 29 None 3 Young + ## 3: 77 Treated Male 30 None 3 Young + ## 4: 17 Treated Male 32 Marked 3 Old + ## 5: 36 Treated Male 46 Marked 5 Old + ## 6: 23 Treated Male 58 Marked 6 Old ##### Risks in adding correlated features -These new features are highly correlated to the `Age` feature because they are simple transformations of this feature. +These new features are highly correlated to the `Age` feature because +they are simple transformations of this feature. -For many machine learning algorithms, using correlated features is not a good idea. It may sometimes make prediction less accurate, and most of the time make interpretation of the model almost impossible. GLM, for instance, assumes that the features are uncorrelated. +For many machine learning algorithms, using correlated features is not a +good idea. It may sometimes make prediction less accurate, and most of +the time make interpretation of the model almost impossible. GLM, for +instance, assumes that the features are uncorrelated. -Fortunately, decision tree algorithms (including boosted trees) are very robust to these features. Therefore we have nothing to do to manage this situation. +Fortunately, decision tree algorithms (including boosted trees) are very +robust to these features. Therefore we don’t have to do anything to +manage this situation. ##### Cleaning data -We remove ID as there is nothing to learn from this feature (it would just add some noise). +We remove ID as there is nothing to learn from this feature (it would +just add some noise). - -```r -df[,ID:=NULL] -``` + df[, ID := NULL] We will list the different values for the column `Treatment`: + levels(df[, Treatment]) -```r -levels(df[,Treatment]) -``` + ## [1] "Placebo" "Treated" -``` -## [1] "Placebo" "Treated" -``` - - -#### One-hot encoding +#### Encoding categorical features Next step, we will transform the categorical data to dummy variables. -This is the [one-hot encoding](http://en.wikipedia.org/wiki/One-hot) step. +Several encoding methods exist, e.g., [one-hot +encoding](https://en.wikipedia.org/wiki/One-hot) is a common approach. +We will use the [dummy contrast +coding](https://stats.oarc.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/) +which is popular because it produces “full rank” encoding (also see +[this blog post by Max +Kuhn](http://appliedpredictivemodeling.com/blog/2013/10/23/the-basics-of-encoding-categorical-data-for-predictive-models)). -The purpose is to transform each value of each *categorical* feature in a *binary* feature `{0, 1}`. +The purpose is to transform each value of each *categorical* feature +into a *binary* feature `{0, 1}`. -For example, the column `Treatment` will be replaced by two columns, `Placebo`, and `Treated`. Each of them will be *binary*. 
Therefore, an observation which has the value `Placebo` in column `Treatment` before the transformation will have after the transformation the value `1` in the new column `Placebo` and the value `0` in the new column `Treated`. The column `Treatment` will disappear during the one-hot encoding. +For example, the column `Treatment` will be replaced by two columns, +`TreatmentPlacebo`, and `TreatmentTreated`. Each of them will be +*binary*. Therefore, an observation which has the value `Placebo` in +column `Treatment` before the transformation will have the value `1` in +the new column `TreatmentPlacebo` and the value `0` in the new column +`TreatmentTreated` after the transformation. The column +`TreatmentPlacebo` will disappear during the contrast encoding, as it +would be absorbed into a common constant intercept column. -Column `Improved` is excluded because it will be our `label` column, the one we want to predict. +Column `Improved` is excluded because it will be our `label` column, the +one we want to predict. + sparse_matrix <- sparse.model.matrix(Improved ~ ., data = df)[, -1] + head(sparse_matrix) -```r -sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df) -head(sparse_matrix) -``` + ## 6 x 9 sparse Matrix of class "dgCMatrix" + ## TreatmentTreated SexMale Age AgeDiscret3 AgeDiscret4 AgeDiscret5 AgeDiscret6 + ## 1 1 1 27 1 . . . + ## 2 1 1 29 1 . . . + ## 3 1 1 30 1 . . . + ## 4 1 1 32 1 . . . + ## 5 1 1 46 . . 1 . + ## 6 1 1 58 . . . 1 + ## AgeDiscret7 AgeCatYoung + ## 1 . 1 + ## 2 . 1 + ## 3 . 1 + ## 4 . . + ## 5 . . + ## 6 . . -``` -## 6 x 10 sparse Matrix of class "dgCMatrix" -## -## 1 . 1 1 27 1 . . . . 1 -## 2 . 1 1 29 1 . . . . 1 -## 3 . 1 1 30 1 . . . . 1 -## 4 . 1 1 32 1 . . . . . -## 5 . 1 1 46 . . 1 . . . -## 6 . 1 1 58 . . . 1 . . -``` - -> Formulae `Improved~.-1` used above means transform all *categorical* features but column `Improved` to binary values. The `-1` is here to remove the first column which is full of `1` (this column is generated by the conversion). For more information, you can type `?sparse.model.matrix` in the console. +> Formula `Improved ~ .` used above means transform all *categorical* +> features but column `Improved` to binary values. The `-1` column +> selection removes the intercept column which is full of `1` (this +> column is generated by the conversion). For more information, you can +> type `?sparse.model.matrix` in the console. Create the output `numeric` vector (not as a sparse `Matrix`): + output_vector <- df[, Improved] == "Marked" -```r -output_vector = df[,Improved] == "Marked" -``` +1. set `Y` vector to `0`; +2. set `Y` to `1` for rows where `Improved == Marked` is `TRUE` ; +3. return `Y` vector. -1. set `Y` vector to `0`; -2. set `Y` to `1` for rows where `Improved == Marked` is `TRUE` ; -3. return `Y` vector. +## Build the model -Build the model ---------------- +The code below is very usual. For more information, you can look at the +documentation of `xgboost` function (or at the vignette [XGBoost +presentation](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd)). -The code below is very usual. For more information, you can look at the documentation of `xgboost` function (or at the vignette [XGBoost presentation](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd)). 
+ bst <- xgboost(data = sparse_matrix, label = output_vector, max_depth = 4, + eta = 1, nthread = 2, nrounds = 10, objective = "binary:logistic") + ## [1] train-logloss:0.485466 + ## [2] train-logloss:0.438534 + ## [3] train-logloss:0.412250 + ## [4] train-logloss:0.395828 + ## [5] train-logloss:0.384264 + ## [6] train-logloss:0.374028 + ## [7] train-logloss:0.365005 + ## [8] train-logloss:0.351233 + ## [9] train-logloss:0.341678 + ## [10] train-logloss:0.334465 -```r -bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 4, - eta = 1, nthread = 2, nrounds = 10,objective = "binary:logistic") -``` +You can see some `train-logloss: 0.XXXXX` lines followed by a number. It +decreases. Each line shows how well the model explains the data. Lower +is better. -``` -## [0] train-error:0.202381 -## [1] train-error:0.166667 -## [2] train-error:0.166667 -## [3] train-error:0.166667 -## [4] train-error:0.154762 -## [5] train-error:0.154762 -## [6] train-error:0.154762 -## [7] train-error:0.166667 -## [8] train-error:0.166667 -## [9] train-error:0.166667 -``` +A small value for training error may be a symptom of +[overfitting](https://en.wikipedia.org/wiki/Overfitting), meaning the +model will not accurately predict unseen values. -You can see some `train-error: 0.XXXXX` lines followed by a number. It decreases. Each line shows how well the model explains your data. Lower is better. - -A model which fits too well may [overfit](http://en.wikipedia.org/wiki/Overfitting) (meaning it copy/paste too much the past, and won't be that good to predict the future). - -> Here you can see the numbers decrease until line 7 and then increase. -> -> It probably means we are overfitting. To fix that I should reduce the number of rounds to `nrounds = 4`. I will let things like that because I don't really care for the purpose of this example :-) - -Feature importance ------------------- +## Feature importance ## Measure feature importance - ### Build the feature importance data.table -In the code below, `sparse_matrix@Dimnames[[2]]` represents the column names of the sparse matrix. These names are the original values of the features (remember, each binary column == one value of one *categorical* feature). +Remember, each binary column corresponds to a single value of one of +*categorical* features. + importance <- xgb.importance(feature_names = colnames(sparse_matrix), model = bst) + head(importance) -```r -importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst) -head(importance) -``` + ## Feature Gain Cover Frequency + ## 1: Age 0.622031769 0.67251696 0.67241379 + ## 2: TreatmentTreated 0.285750540 0.11916651 0.10344828 + ## 3: SexMale 0.048744022 0.04522028 0.08620690 + ## 4: AgeDiscret6 0.016604639 0.04784639 0.05172414 + ## 5: AgeDiscret3 0.016373781 0.08028951 0.05172414 + ## 6: AgeDiscret4 0.009270557 0.02858801 0.01724138 -``` -## Feature Gain Cover Frequency -## 1: Age 0.622031651 0.67251706 0.67241379 -## 2: TreatmentPlacebo 0.285750607 0.11916656 0.10344828 -## 3: SexMale 0.048744054 0.04522027 0.08620690 -## 4: AgeDiscret6 0.016604647 0.04784637 0.05172414 -## 5: AgeDiscret3 0.016373791 0.08028939 0.05172414 -## 6: AgeDiscret4 0.009270558 0.02858801 0.01724138 -``` - -> The column `Gain` provide the information we are looking for. +> The column `Gain` provides the information we are looking for. > > As you can see, features are classified by `Gain`. -`Gain` is the improvement in accuracy brought by a feature to the branches it is on. 
The idea is that before adding a new split on a feature X to the branch there was some wrongly classified elements, after adding the split on this feature, there are two new branches, and each of these branch is more accurate (one branch saying if your observation is on this branch then it should be classified as `1`, and the other branch saying the exact opposite). +`Gain` is the improvement in accuracy brought by a feature to the +branches it is on. The idea is that before adding a new split on a +feature X to the branch there were some wrongly classified elements; +after adding the split on this feature, there are two new branches, and +each of these branches is more accurate (one branch saying if your +observation is on this branch then it should be classified as `1`, and +the other branch saying the exact opposite). -`Cover` measures the relative quantity of observations concerned by a feature. +`Cover` is related to the second order derivative (or Hessian) of the +loss function with respect to a particular variable; thus, a large value +indicates a variable has a large potential impact on the loss function +and so is important. -`Frequency` is a simpler way to measure the `Gain`. It just counts the number of times a feature is used in all generated trees. You should not use it (unless you know why you want to use it). - -#### Improvement in the interpretability of feature importance data.table - -We can go deeper in the analysis of the model. In the `data.table` above, we have discovered which features counts to predict if the illness will go or not. But we don't yet know the role of these features. For instance, one of the question we may want to answer would be: does receiving a placebo treatment helps to recover from the illness? - -One simple solution is to count the co-occurrences of a feature and a class of the classification. - -For that purpose we will execute the same function as above but using two more parameters, `data` and `label`. - - -```r -importanceRaw <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst, data = sparse_matrix, label = output_vector) - -# Cleaning for better display -importanceClean <- importanceRaw[,`:=`(Cover=NULL, Frequency=NULL)] - -head(importanceClean) -``` - -``` -## Feature Split Gain RealCover RealCover % -## 1: TreatmentPlacebo -1.00136e-05 0.28575061 7 0.2500000 -## 2: Age 61.5 0.16374034 12 0.4285714 -## 3: Age 39 0.08705750 8 0.2857143 -## 4: Age 57.5 0.06947553 11 0.3928571 -## 5: SexMale -1.00136e-05 0.04874405 4 0.1428571 -## 6: Age 53.5 0.04620627 10 0.3571429 -``` - -> In the table above we have removed two not needed columns and select only the first lines. - -First thing you notice is the new column `Split`. It is the split applied to the feature on a branch of one of the tree. Each split is present, therefore a feature can appear several times in this table. Here we can see the feature `Age` is used several times with different splits. - -How the split is applied to count the co-occurrences? It is always `<`. For instance, in the second line, we measure the number of persons under 61.5 years with the illness gone after the treatment. - -The two other new columns are `RealCover` and `RealCover %`. In the first column it measures the number of observations in the dataset where the split is respected and the label marked as `1`. The second column is the percentage of the whole population that `RealCover` represents. 
- -Therefore, according to our findings, getting a placebo doesn't seem to help but being younger than 61 years may help (seems logic). - -> You may wonder how to interpret the `< 1.00001` on the first line. Basically, in a sparse `Matrix`, there is no `0`, therefore, looking for one hot-encoded categorical observations validating the rule `< 1.00001` is like just looking for `1` for this feature. +`Frequency` is a simpler way to measure the `Gain`. It just counts the +number of times a feature is used in all generated trees. You should not +use it (unless you know why you want to use it). ### Plotting the feature importance +All these things are nice, but it would be even better to plot the +results. -All these things are nice, but it would be even better to plot the results. + xgb.plot.importance(importance_matrix = importance) + -```r -xgb.plot.importance(importance_matrix = importanceRaw) -``` +Running this line of code, you should get a bar chart showing the +importance of the 6 features (containing the same data as the output we +saw earlier, but displaying it visually for easier consumption). Note +that `xgb.ggplot.importance` is also available for all the ggplot2 fans! -``` -## Error in xgb.plot.importance(importance_matrix = importanceRaw): Importance matrix is not correct (column names issue) -``` +> Depending of the dataset and the learning parameters you may have more +> than two clusters. Default value is to limit them to `10`, but you can +> increase this limit. Look at the function documentation for more +> information. -Feature have automatically been divided in 2 clusters: the interesting features... and the others. +According to the plot above, the most important features in this dataset +to predict if the treatment will work are : -> Depending of the dataset and the learning parameters you may have more than two clusters. Default value is to limit them to `10`, but you can increase this limit. Look at the function documentation for more information. - -According to the plot above, the most important features in this dataset to predict if the treatment will work are : - -* the Age ; -* having received a placebo or not ; -* the sex is third but already included in the not interesting features group ; -* then we see our generated features (AgeDiscret). We can see that their contribution is very low. +- An individual’s age; +- Having received a placebo or not; +- Gender; +- Our generated feature AgeDiscret. We can see that its contribution + is very low. ### Do these results make sense? - -Let's check some **Chi2** between each of these features and the label. +Let’s check some **Chi2** between each of these features and the label. Higher **Chi2** means better correlation. + c2 <- chisq.test(df$Age, output_vector) + print(c2) -```r -c2 <- chisq.test(df$Age, output_vector) -print(c2) -``` + ## + ## Pearson's Chi-squared test + ## + ## data: df$Age and output_vector + ## X-squared = 35.475, df = 35, p-value = 0.4458 -``` -## -## Pearson's Chi-squared test -## -## data: df$Age and output_vector -## X-squared = 35.475, df = 35, p-value = 0.4458 -``` +The Pearson correlation between Age and illness disappearing is +**35.47**. -Pearson correlation between Age and illness disappearing is **35.48**. 
+ c2 <- chisq.test(df$AgeDiscret, output_vector) + print(c2) + ## + ## Pearson's Chi-squared test + ## + ## data: df$AgeDiscret and output_vector + ## X-squared = 8.2554, df = 5, p-value = 0.1427 -```r -c2 <- chisq.test(df$AgeDiscret, output_vector) -print(c2) -``` +Our first simplification of Age gives a Pearson correlation of **8.26**. -``` -## -## Pearson's Chi-squared test -## -## data: df$AgeDiscret and output_vector -## X-squared = 8.2554, df = 5, p-value = 0.1427 -``` + c2 <- chisq.test(df$AgeCat, output_vector) + print(c2) -Our first simplification of Age gives a Pearson correlation is **8.26**. + ## + ## Pearson's Chi-squared test with Yates' continuity correction + ## + ## data: df$AgeCat and output_vector + ## X-squared = 2.3571, df = 1, p-value = 0.1247 +The perfectly random split we did between young and old at 30 years old +has a low correlation of **2.36**. This suggests that, for the +particular illness we are studying, the age at which someone is +vulnerable to this disease is likely very different from 30. -```r -c2 <- chisq.test(df$AgeCat, output_vector) -print(c2) -``` +Moral of the story: don’t let your *gut* lower the quality of your +model. -``` -## -## Pearson's Chi-squared test with Yates' continuity correction -## -## data: df$AgeCat and output_vector -## X-squared = 2.3571, df = 1, p-value = 0.1247 -``` +In *data science*, there is the word *science* :-) -The perfectly random split I did between young and old at 30 years old have a low correlation of **2.36**. It's a result we may expect as may be in my mind > 30 years is being old (I am 32 and starting feeling old, this may explain that), but for the illness we are studying, the age to be vulnerable is not the same. +## Conclusion -Morality: don't let your *gut* lower the quality of your model. +As you can see, in general *destroying information by simplifying it +won’t improve your model*. **Chi2** just demonstrates that. -In *data science* expression, there is the word *science* :-) +But in more complex cases, creating a new feature from an existing one +may help the algorithm and improve the model. -Conclusion ----------- ++The case studied here is not complex enough to show that. Check [Kaggle +website](https://www.kaggle.com/) for some challenging datasets. -As you can see, in general *destroying information by simplifying it won't improve your model*. **Chi2** just demonstrates that. +Moreover, you can see that even if we have added some new features which +are not very useful/highly correlated with other features, the boosting +tree algorithm was still able to choose the best one (which in this case +is the Age). -But in more complex cases, creating a new feature based on existing one which makes link with the outcome more obvious may help the algorithm and improve the model. +Linear models may not perform as well. -The case studied here is not enough complex to show that. Check [Kaggle website](http://www.kaggle.com/) for some challenging datasets. However it's almost always worse when you add some arbitrary rules. +## Special Note: What about Random Forests™? -Moreover, you can notice that even if we have added some not useful new features highly correlated with other features, the boosting tree algorithm have been able to choose the best one, which in this case is the Age. +As you may know, the [Random +Forests](https://en.wikipedia.org/wiki/Random_forest) algorithm is +cousin with boosting and both are part of the [ensemble +learning](https://en.wikipedia.org/wiki/Ensemble_learning) family. 
-Linear models may not be that smart in this scenario. +Both train several decision trees for one dataset. The *main* difference +is that in Random Forests, trees are independent and in boosting, the +`N+1`-st tree focuses its learning on the loss (<=> what has not +been well modeled by the tree `N`). -Special Note: What about Random Forests™? ------------------------------------------ +This difference can have an impact on a edge case in feature importance +analysis: *correlated features*. -As you may know, [Random Forests](http://en.wikipedia.org/wiki/Random_forest) algorithm is cousin with boosting and both are part of the [ensemble learning](http://en.wikipedia.org/wiki/Ensemble_learning) family. +Imagine two features perfectly correlated, feature `A` and feature `B`. +For one specific tree, if the algorithm needs one of them, it will +choose randomly (true in both boosting and Random Forests). -Both train several decision trees for one dataset. The *main* difference is that in Random Forests, trees are independent and in boosting, the tree `N+1` focus its learning on the loss (<=> what has not been well modeled by the tree `N`). +However, in Random Forests this random choice will be done for each +tree, because each tree is independent from the others. Therefore, +approximately (and depending on your parameters) 50% of the trees will +choose feature `A` and the other 50% will choose feature `B`. So the +*importance* of the information contained in `A` and `B` (which is the +same, because they are perfectly correlated) is diluted in `A` and `B`. +So you won’t easily know this information is important to predict what +you want to predict! It is even worse when you have 10 correlated +features… -This difference have an impact on a corner case in feature importance analysis: the *correlated features*. +In boosting, when a specific link between feature and outcome have been +learned by the algorithm, it will try to not refocus on it (in theory it +is what happens, reality is not always that simple). Therefore, all the +importance will be on feature `A` or on feature `B` (but not both). You +will know that one feature has an important role in the link between the +observations and the label. It is still up to you to search for the +correlated features to the one detected as important if you need to know +all of them. -Imagine two features perfectly correlated, feature `A` and feature `B`. For one specific tree, if the algorithm needs one of them, it will choose randomly (true in both boosting and Random Forests). +If you want to try Random Forests algorithm, you can tweak XGBoost +parameters! -However, in Random Forests this random choice will be done for each tree, because each tree is independent from the others. Therefore, approximatively, depending of your parameters, 50% of the trees will choose feature `A` and the other 50% will choose feature `B`. So the *importance* of the information contained in `A` and `B` (which is the same, because they are perfectly correlated) is diluted in `A` and `B`. So you won't easily know this information is important to predict what you want to predict! It is even worse when you have 10 correlated features... +For instance, to compute a model with 1000 trees, with a 0.5 factor on +sampling rows and columns: -In boosting, when a specific link between feature and outcome have been learned by the algorithm, it will try to not refocus on it (in theory it is what happens, reality is not always that simple). 
Therefore, all the importance will be on feature `A` or on feature `B` (but not both). You will know that one feature have an important role in the link between the observations and the label. It is still up to you to search for the correlated features to the one detected as important if you need to know all of them. + data(agaricus.train, package = 'xgboost') + data(agaricus.test, package = 'xgboost') + train <- agaricus.train + test <- agaricus.test -If you want to try Random Forests algorithm, you can tweak XGBoost parameters! + #Random Forest - 1000 trees + bst <- xgboost( + data = train$data + , label = train$label + , max_depth = 4 + , num_parallel_tree = 1000 + , subsample = 0.5 + , colsample_bytree = 0.5 + , nrounds = 1 + , objective = "binary:logistic" + ) -**Warning**: this is still an experimental parameter. + ## [1] train-logloss:0.456201 -For instance, to compute a model with 1000 trees, with a 0.5 factor on sampling rows and columns: + #Boosting - 3 rounds + bst <- xgboost( + data = train$data + , label = train$label + , max_depth = 4 + , nrounds = 3 + , objective = "binary:logistic" + ) - -```r -data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') -train <- agaricus.train -test <- agaricus.test - -#Random Forest - 1000 trees -bst <- xgboost(data = train$data, label = train$label, max.depth = 4, num_parallel_tree = 1000, subsample = 0.5, colsample_bytree =0.5, nrounds = 1, objective = "binary:logistic") -``` - -``` -## [0] train-error:0.002150 -``` - -```r -#Boosting - 3 rounds -bst <- xgboost(data = train$data, label = train$label, max.depth = 4, nrounds = 3, objective = "binary:logistic") -``` - -``` -## [0] train-error:0.006142 -## [1] train-error:0.006756 -## [2] train-error:0.001228 -``` + ## [1] train-logloss:0.444882 + ## [2] train-logloss:0.302428 + ## [3] train-logloss:0.212847 > Note that the parameter `round` is set to `1`. -> [**Random Forests**](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_papers.htm) is a trademark of Leo Breiman and Adele Cutler and is licensed exclusively to Salford Systems for the commercial release of the software. +> [**Random +> Forests**](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_papers.htm) +> is a trademark of Leo Breiman and Adele Cutler and is licensed +> exclusively to Salford Systems for the commercial release of the +> software. From d495a180d8d486b11f5c4e8903a614d8604cdd13 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Wed, 9 Aug 2023 18:32:23 +0800 Subject: [PATCH 075/136] [pyspark] add logs for training (#9449) --- python-package/xgboost/spark/core.py | 26 +++++++++++++++++--------- python-package/xgboost/spark/utils.py | 4 ++++ 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index 998afbf77..2150e5055 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -924,21 +924,17 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): # Note: Checking `is_cudf_available` in spark worker side because # spark worker might has different python environment with driver side. 
use_qdm = use_qdm and is_cudf_available() + get_logger("XGBoost-PySpark").info( + "Leveraging %s to train with QDM: %s", + booster_params["device"], + "on" if use_qdm else "off", + ) if use_qdm and (booster_params.get("max_bin", None) is not None): dmatrix_kwargs["max_bin"] = booster_params["max_bin"] _rabit_args = {} if context.partitionId() == 0: - get_logger("XGBoostPySpark").debug( - "booster params: %s\n" - "train_call_kwargs_params: %s\n" - "dmatrix_kwargs: %s", - booster_params, - train_call_kwargs_params, - dmatrix_kwargs, - ) - _rabit_args = _get_rabit_args(context, num_workers) worker_message = { @@ -995,7 +991,19 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): ) return ret[0], ret[1] + get_logger("XGBoost-PySpark").info( + "Running xgboost-%s on %s workers with" + "\n\tbooster params: %s" + "\n\ttrain_call_kwargs_params: %s" + "\n\tdmatrix_kwargs: %s", + xgboost._py_version(), + num_workers, + booster_params, + train_call_kwargs_params, + dmatrix_kwargs, + ) (config, booster) = _run_job() + get_logger("XGBoost-PySpark").info("Finished xgboost training!") result_xgb_model = self._convert_to_sklearn_model( bytearray(booster, "utf-8"), config diff --git a/python-package/xgboost/spark/utils.py b/python-package/xgboost/spark/utils.py index 5f3bb19ba..33a45a90e 100644 --- a/python-package/xgboost/spark/utils.py +++ b/python-package/xgboost/spark/utils.py @@ -104,6 +104,10 @@ def get_logger(name: str, level: str = "INFO") -> logging.Logger: # If the logger is configured, skip the configure if not logger.handlers and not logging.getLogger().handlers: handler = logging.StreamHandler(sys.stderr) + formatter = logging.Formatter( + "%(asctime)s %(levelname)s %(name)s: %(funcName)s %(message)s" + ) + handler.setFormatter(formatter) logger.addHandler(handler) return logger From f05a23b41cb9b3e55fe9f53c3ca26203c1d42748 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 10 Aug 2023 00:40:06 +0800 Subject: [PATCH 076/136] Use `weakref` instead of `id` for `DataIter` cache. (#9445) - Fix case where Python reuses id from freed objects. - Small optimization to column matrix with QDM by using `realloc` instead of copying data. 
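The core idea of the fix can be sketched in a few lines of plain Python (illustrative only, not the actual `DataIter` code; the `BatchCache` class and `transform` callback are made-up names):

```python
import weakref


class BatchCache:
    """Cache the transformed form of the most recent input batch."""

    def __init__(self):
        self._ref = None     # weak reference to the last raw input, if any
        self._cached = None  # transformed result for that input

    def get(self, data, transform):
        # id(new_obj) can equal id(freed_obj) because CPython reuses memory
        # addresses, so an id-keyed cache may return stale results.  A weak
        # reference to a freed object dereferences to None instead, so the
        # identity test below can never match a recycled object.
        if self._ref is not None and self._ref() is data:
            return self._cached
        try:
            self._ref = weakref.ref(data)
        except TypeError:  # some types (e.g. plain lists) reject weakrefs
            self._ref = None
        self._cached = transform(data)
        return self._cached
```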
--- demo/guide-python/external_memory.py | 6 ++- doc/python/python_api.rst | 4 ++ python-package/xgboost/_typing.py | 11 +++- python-package/xgboost/core.py | 43 ++++++++++++--- python-package/xgboost/data.py | 10 ++-- python-package/xgboost/testing/__init__.py | 18 ++++--- src/common/column_matrix.h | 35 ++++++++----- src/common/io.h | 6 +-- tests/ci_build/lint_python.py | 1 + tests/cpp/common/test_column_matrix.cc | 61 +++++++++++++++++----- tests/cpp/common/test_io.cc | 14 +++++ tests/cpp/data/test_iterative_dmatrix.cc | 8 ++- tests/python/test_data_iterator.py | 18 ++++++- tests/python/test_quantile_dmatrix.py | 21 +++++++- 14 files changed, 193 insertions(+), 63 deletions(-) diff --git a/demo/guide-python/external_memory.py b/demo/guide-python/external_memory.py index fdaa9dab9..6d789486e 100644 --- a/demo/guide-python/external_memory.py +++ b/demo/guide-python/external_memory.py @@ -22,7 +22,10 @@ import xgboost def make_batches( - n_samples_per_batch: int, n_features: int, n_batches: int, tmpdir: str, + n_samples_per_batch: int, + n_features: int, + n_batches: int, + tmpdir: str, ) -> List[Tuple[str, str]]: files: List[Tuple[str, str]] = [] rng = np.random.RandomState(1994) @@ -38,6 +41,7 @@ def make_batches( class Iterator(xgboost.DataIter): """A custom iterator for loading files in batches.""" + def __init__(self, file_paths: List[Tuple[str, str]]): self._file_paths = file_paths self._it = 0 diff --git a/doc/python/python_api.rst b/doc/python/python_api.rst index 0cbf63456..38b22a994 100644 --- a/doc/python/python_api.rst +++ b/doc/python/python_api.rst @@ -23,12 +23,16 @@ Core Data Structure :show-inheritance: .. autoclass:: xgboost.QuantileDMatrix + :members: :show-inheritance: .. autoclass:: xgboost.Booster :members: :show-inheritance: +.. autoclass:: xgboost.DataIter + :members: + :show-inheritance: Learning API ------------ diff --git a/python-package/xgboost/_typing.py b/python-package/xgboost/_typing.py index 39952aca9..a36757a81 100644 --- a/python-package/xgboost/_typing.py +++ b/python-package/xgboost/_typing.py @@ -8,7 +8,9 @@ from typing import ( Callable, Dict, List, + Optional, Sequence, + Tuple, Type, TypeVar, Union, @@ -20,8 +22,6 @@ import numpy as np DataType = Any -# xgboost accepts some other possible types in practice due to historical reason, which is -# lesser tested. For now we encourage users to pass a simple list of string. FeatureInfo = Sequence[str] FeatureNames = FeatureInfo FeatureTypes = FeatureInfo @@ -97,6 +97,13 @@ else: ctypes._Pointer, ] +# The second arg is actually Optional[List[cudf.Series]], skipped for easier type check. +# The cudf Series is the obtained cat codes, preserved in the `DataIter` to prevent it +# being freed. 
+TransformedData = Tuple[ + Any, Optional[List], Optional[FeatureNames], Optional[FeatureTypes] +] + # template parameter _T = TypeVar("_T") _F = TypeVar("_F", bound=Callable[..., Any]) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 14a96f117..fbedfd7fb 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -9,6 +9,7 @@ import os import re import sys import warnings +import weakref from abc import ABC, abstractmethod from collections.abc import Mapping from enum import IntEnum, unique @@ -51,6 +52,7 @@ from ._typing import ( FeatureTypes, ModelIn, NumpyOrCupy, + TransformedData, c_bst_ulong, ) from .compat import PANDAS_INSTALLED, DataFrame, py_str @@ -486,7 +488,16 @@ def _prediction_output( class DataIter(ABC): # pylint: disable=too-many-instance-attributes - """The interface for user defined data iterator. + """The interface for user defined data iterator. The iterator facilitates + distributed training, :py:class:`QuantileDMatrix`, and external memory support using + :py:class:`DMatrix`. Most of time, users don't need to interact with this class + directly. + + .. note:: + + The class caches some intermediate results using the `data` input (predictor + `X`) as key. Don't repeat the `X` for multiple batches with different meta data + (like `label`), make a copy if necessary. Parameters ---------- @@ -510,13 +521,13 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes self._allow_host = True self._release = release_data # Stage data in Python until reset or next is called to avoid data being free. - self._temporary_data: Optional[Tuple[Any, Any, Any, Any]] = None - self._input_id: int = 0 + self._temporary_data: Optional[TransformedData] = None + self._data_ref: Optional[weakref.ReferenceType] = None def get_callbacks( self, allow_host: bool, enable_categorical: bool ) -> Tuple[Callable, Callable]: - """Get callback functions for iterating in C.""" + """Get callback functions for iterating in C. This is an internal function.""" assert hasattr(self, "cache_prefix"), "__init__ is not called." self._reset_callback = ctypes.CFUNCTYPE(None, ctypes.c_void_p)( self._reset_wrapper @@ -591,7 +602,19 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes from .data import _proxy_transform, dispatch_proxy_set_data # Reduce the amount of transformation that's needed for QuantileDMatrix. - if self._temporary_data is not None and id(data) == self._input_id: + # + # To construct the QDM, one needs 4 iterations on CPU, or 2 iterations on + # GPU. If the QDM has only one batch of input (most of the cases), we can + # avoid transforming the data repeatly. + try: + ref = weakref.ref(data) + except TypeError: + ref = None + if ( + self._temporary_data is not None + and ref is not None + and ref is self._data_ref + ): new, cat_codes, feature_names, feature_types = self._temporary_data else: new, cat_codes, feature_names, feature_types = _proxy_transform( @@ -608,7 +631,7 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes feature_types=feature_types, **kwargs, ) - self._input_id = id(data) + self._data_ref = ref # pylint: disable=not-callable return self._handle_exception(lambda: self.next(input_data), 0) @@ -1134,7 +1157,7 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m testing purposes. If this is a quantized DMatrix then quantized values are returned instead of input values. - .. versionadded:: 1.7.0 + .. 
versionadded:: 1.7.0 """ indptr = np.empty(self.num_row() + 1, dtype=np.uint64) @@ -1155,7 +1178,11 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m return ret def get_quantile_cut(self) -> Tuple[np.ndarray, np.ndarray]: - """Get quantile cuts for quantization.""" + """Get quantile cuts for quantization. + + .. versionadded:: 2.0.0 + + """ n_features = self.num_col() c_sindptr = ctypes.c_char_p() diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 7864d541f..04bdc5739 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -5,7 +5,7 @@ import ctypes import json import os import warnings -from typing import Any, Callable, Iterator, List, Optional, Sequence, Tuple, Union, cast +from typing import Any, Callable, Iterator, List, Optional, Sequence, Tuple, cast import numpy as np @@ -17,6 +17,7 @@ from ._typing import ( FloatCompatible, NumpyDType, PandasDType, + TransformedData, c_bst_ulong, ) from .compat import DataFrame, lazy_isinstance @@ -1268,12 +1269,7 @@ def _proxy_transform( feature_names: Optional[FeatureNames], feature_types: Optional[FeatureTypes], enable_categorical: bool, -) -> Tuple[ - Union[bool, ctypes.c_void_p, np.ndarray], - Optional[list], - Optional[FeatureNames], - Optional[FeatureTypes], -]: +) -> TransformedData: if _is_cudf_df(data) or _is_cudf_ser(data): return _transform_cudf_df( data, feature_names, feature_types, enable_categorical diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 48809b46f..8a21b6085 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -230,7 +230,7 @@ class IteratorForTest(xgb.core.DataIter): def as_arrays( self, - ) -> Tuple[Union[np.ndarray, sparse.csr_matrix], ArrayLike, ArrayLike]: + ) -> Tuple[Union[np.ndarray, sparse.csr_matrix], ArrayLike, Optional[ArrayLike]]: if isinstance(self.X[0], sparse.csr_matrix): X = sparse.vstack(self.X, format="csr") else: @@ -244,7 +244,12 @@ class IteratorForTest(xgb.core.DataIter): def make_batches( - n_samples_per_batch: int, n_features: int, n_batches: int, use_cupy: bool = False + n_samples_per_batch: int, + n_features: int, + n_batches: int, + use_cupy: bool = False, + *, + vary_size: bool = False, ) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]: X = [] y = [] @@ -255,10 +260,11 @@ def make_batches( rng = cupy.random.RandomState(1994) else: rng = np.random.RandomState(1994) - for _ in range(n_batches): - _X = rng.randn(n_samples_per_batch, n_features) - _y = rng.randn(n_samples_per_batch) - _w = rng.uniform(low=0, high=1, size=n_samples_per_batch) + for i in range(n_batches): + n_samples = n_samples_per_batch + i * 10 if vary_size else n_samples_per_batch + _X = rng.randn(n_samples, n_features) + _y = rng.randn(n_samples) + _w = rng.uniform(low=0, high=1, size=n_samples) X.append(_X) y.append(_y) w.append(_w) diff --git a/src/common/column_matrix.h b/src/common/column_matrix.h index 78361744d..0862c21ad 100644 --- a/src/common/column_matrix.h +++ b/src/common/column_matrix.h @@ -9,12 +9,12 @@ #define XGBOOST_COMMON_COLUMN_MATRIX_H_ #include -#include // for size_t +#include // for size_t, byte #include // for uint8_t #include #include -#include // for move -#include +#include // for enable_if_t, is_same_v, is_signed_v +#include // for move #include "../data/adapter.h" #include "../data/gradient_index.h" @@ -112,9 +112,6 @@ class SparseColumnIter : public Column { */ template class 
DenseColumnIter : public Column { - public: - using ByteType = bool; - private: using Base = Column; /* flags for missing values in dense columns */ @@ -153,8 +150,17 @@ class ColumnMatrix { * @brief A bit set for indicating whether an element in a dense column is missing. */ struct MissingIndicator { - LBitField32 missing; - RefResourceView storage; + using BitFieldT = LBitField32; + using T = typename BitFieldT::value_type; + + BitFieldT missing; + RefResourceView storage; + static_assert(std::is_same_v); + + template + [[nodiscard]] std::enable_if_t, U> static InitValue(bool init) { + return init ? ~U{0} : U{0}; + } MissingIndicator() = default; /** @@ -163,7 +169,7 @@ class ColumnMatrix { */ MissingIndicator(std::size_t n_elements, bool init) { auto m_size = missing.ComputeStorageSize(n_elements); - storage = common::MakeFixedVecWithMalloc(m_size, init ? ~std::uint32_t{0} : std::uint32_t{0}); + storage = common::MakeFixedVecWithMalloc(m_size, InitValue(init)); this->InitView(); } /** @brief Set the i^th element to be a valid element (instead of missing). */ @@ -181,11 +187,12 @@ class ColumnMatrix { if (m_size == storage.size()) { return; } + // grow the storage + auto resource = std::dynamic_pointer_cast(storage.Resource()); + CHECK(resource); + resource->Resize(m_size * sizeof(T), InitValue(init)); + storage = RefResourceView{resource->DataAs(), m_size, resource}; - auto new_storage = - common::MakeFixedVecWithMalloc(m_size, init ? ~std::uint32_t{0} : std::uint32_t{0}); - std::copy_n(storage.cbegin(), storage.size(), new_storage.begin()); - storage = std::move(new_storage); this->InitView(); } }; @@ -210,7 +217,6 @@ class ColumnMatrix { } public: - using ByteType = bool; // get number of features [[nodiscard]] bst_feature_t GetNumFeature() const { return static_cast(type_.size()); @@ -408,6 +414,7 @@ class ColumnMatrix { // IO procedures for external memory. [[nodiscard]] bool Read(AlignedResourceReadStream* fi, uint32_t const* index_base); [[nodiscard]] std::size_t Write(AlignedFileWriteStream* fo) const; + [[nodiscard]] MissingIndicator const& Missing() const { return missing_; } private: RefResourceView index_; diff --git a/src/common/io.h b/src/common/io.h index baf518aa5..95971abae 100644 --- a/src/common/io.h +++ b/src/common/io.h @@ -10,7 +10,7 @@ #include #include -#include // for min +#include // for min, fill_n, copy_n #include // for array #include // for byte, size_t #include // for malloc, realloc, free @@ -207,7 +207,7 @@ class MallocResource : public ResourceHandler { * @param n_bytes The new size. */ template - void Resize(std::size_t n_bytes) { + void Resize(std::size_t n_bytes, std::byte init = std::byte{0}) { // realloc(ptr, 0) works, but is deprecated. if (n_bytes == 0) { this->Clear(); @@ -236,7 +236,7 @@ class MallocResource : public ResourceHandler { std::copy_n(reinterpret_cast(ptr_), n_, reinterpret_cast(new_ptr)); } // default initialize - std::memset(reinterpret_cast(new_ptr) + n_, '\0', n_bytes - n_); + std::fill_n(reinterpret_cast(new_ptr) + n_, n_bytes - n_, init); // free the old ptr if malloc is used. 
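// Illustrative aside, not from the original patch: with the new `init`
// argument, Resize(n_bytes, init) keeps bytes [0, n_) unchanged and fills
// the grown tail [n_, n_bytes) with `init`; e.g. calling Resize(8,
// std::byte{3}) on a 4-byte buffer leaves bytes 0-3 intact and sets bytes
// 4-7 to 3, as exercised by the new assertions in test_io.cc below.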
if (need_copy) { this->Clear(); diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py index 9749a8485..e6cfb462b 100644 --- a/tests/ci_build/lint_python.py +++ b/tests/ci_build/lint_python.py @@ -42,6 +42,7 @@ class LintersPaths: "demo/guide-python/feature_weights.py", "demo/guide-python/sklearn_parallel.py", "demo/guide-python/spark_estimator_examples.py", + "demo/guide-python/external_memory.py", "demo/guide-python/individual_trees.py", "demo/guide-python/quantile_regression.py", "demo/guide-python/multioutput_regression.py", diff --git a/tests/cpp/common/test_column_matrix.cc b/tests/cpp/common/test_column_matrix.cc index 8b8df4861..8445dc466 100644 --- a/tests/cpp/common/test_column_matrix.cc +++ b/tests/cpp/common/test_column_matrix.cc @@ -2,15 +2,26 @@ * Copyright 2018-2023 by XGBoost Contributors */ #include +#include // for bst_bin_t +#include // for Context +#include // for BatchIterator, BatchSet, DMatrix, Met... -#include "../../../src/common/column_matrix.h" -#include "../helpers.h" +#include // for size_t +#include // for int32_t, uint16_t, uint8_t +#include // for numeric_limits +#include // for shared_ptr, __shared_ptr_access, allo... +#include // for remove_reference_t +#include "../../../src/common/column_matrix.h" // for ColumnMatrix, Column, DenseColumnIter +#include "../../../src/common/hist_util.h" // for DispatchBinType, BinTypeSize, Index +#include "../../../src/common/ref_resource_view.h" // for RefResourceView +#include "../../../src/data/gradient_index.h" // for GHistIndexMatrix +#include "../../../src/data/iterative_dmatrix.h" // for IterativeDMatrix +#include "../../../src/tree/param.h" // for TrainParam +#include "../helpers.h" // for RandomDataGenerator, NumpyArrayIterFo... -namespace xgboost { -namespace common { - -TEST(DenseColumn, Test) { +namespace xgboost::common { +TEST(ColumnMatrix, Basic) { int32_t max_num_bins[] = {static_cast(std::numeric_limits::max()) + 1, static_cast(std::numeric_limits::max()) + 1, static_cast(std::numeric_limits::max()) + 2}; @@ -22,7 +33,7 @@ TEST(DenseColumn, Test) { GHistIndexMatrix gmat{&ctx, dmat.get(), max_num_bin, sparse_thresh, false}; ColumnMatrix column_matrix; for (auto const& page : dmat->GetBatches()) { - column_matrix.InitFromSparse(page, gmat, sparse_thresh, AllThreadsForTest()); + column_matrix.InitFromSparse(page, gmat, sparse_thresh, ctx.Threads()); } ASSERT_GE(column_matrix.GetTypeSize(), last); ASSERT_LE(column_matrix.GetTypeSize(), kUint32BinsTypeSize); @@ -59,7 +70,7 @@ void CheckSparseColumn(SparseColumnIter* p_col, const GHistIndexMatr } } -TEST(SparseColumn, Test) { +TEST(ColumnMatrix, SparseColumn) { int32_t max_num_bins[] = {static_cast(std::numeric_limits::max()) + 1, static_cast(std::numeric_limits::max()) + 1, static_cast(std::numeric_limits::max()) + 2}; @@ -69,7 +80,7 @@ TEST(SparseColumn, Test) { GHistIndexMatrix gmat{&ctx, dmat.get(), max_num_bin, 0.5f, false}; ColumnMatrix column_matrix; for (auto const& page : dmat->GetBatches()) { - column_matrix.InitFromSparse(page, gmat, 1.0, AllThreadsForTest()); + column_matrix.InitFromSparse(page, gmat, 1.0, ctx.Threads()); } common::DispatchBinType(column_matrix.GetTypeSize(), [&](auto dtype) { using T = decltype(dtype); @@ -90,7 +101,7 @@ void CheckColumWithMissingValue(const DenseColumnIter& col, } } -TEST(DenseColumnWithMissing, Test) { +TEST(ColumnMatrix, DenseColumnWithMissing) { int32_t max_num_bins[] = {static_cast(std::numeric_limits::max()) + 1, static_cast(std::numeric_limits::max()) + 1, 
static_cast(std::numeric_limits::max()) + 2}; @@ -100,7 +111,7 @@ TEST(DenseColumnWithMissing, Test) { GHistIndexMatrix gmat(&ctx, dmat.get(), max_num_bin, 0.2, false); ColumnMatrix column_matrix; for (auto const& page : dmat->GetBatches()) { - column_matrix.InitFromSparse(page, gmat, 0.2, AllThreadsForTest()); + column_matrix.InitFromSparse(page, gmat, 0.2, ctx.Threads()); } ASSERT_TRUE(column_matrix.AnyMissing()); DispatchBinType(column_matrix.GetTypeSize(), [&](auto dtype) { @@ -110,5 +121,29 @@ TEST(DenseColumnWithMissing, Test) { }); } } -} // namespace common -} // namespace xgboost + +TEST(ColumnMatrix, GrowMissing) { + float sparsity = 0.5; + NumpyArrayIterForTest iter(sparsity); + auto n_threads = 0; + bst_bin_t n_bins = 16; + BatchParam batch{n_bins, tree::TrainParam::DftSparseThreshold()}; + Context ctx; + auto m = std::make_shared(&iter, iter.Proxy(), nullptr, Reset, Next, + std::numeric_limits::quiet_NaN(), + n_threads, n_bins); + for (auto const& page : m->GetBatches(&ctx, batch)) { + auto const& column_matrix = page.Transpose(); + auto const& missing = column_matrix.Missing(); + auto n = NumpyArrayIterForTest::Rows() * NumpyArrayIterForTest::Cols(); + auto expected = std::remove_reference_t::BitFieldT::ComputeStorageSize(n); + auto got = missing.storage.size(); + ASSERT_EQ(expected, got); + DispatchBinType(column_matrix.GetTypeSize(), [&](auto dtype) { + using T = decltype(dtype); + auto col = column_matrix.DenseColumn(0); + CheckColumWithMissingValue(col, page); + }); + } +} +} // namespace xgboost::common diff --git a/tests/cpp/common/test_io.cc b/tests/cpp/common/test_io.cc index 986e58c5a..8bc12698b 100644 --- a/tests/cpp/common/test_io.cc +++ b/tests/cpp/common/test_io.cc @@ -119,6 +119,20 @@ TEST(IO, Resource) { for (std::size_t i = n; i < 2 * n; ++i) { ASSERT_EQ(malloc_resource->DataAs()[i], 0); } + + ptr = malloc_resource->DataAs(); + std::fill_n(ptr, malloc_resource->Size(), 7); + if (force_malloc) { + malloc_resource->Resize(n * 3, std::byte{3}); + } else { + malloc_resource->Resize(n * 3, std::byte{3}); + } + for (std::size_t i = 0; i < n * 2; ++i) { + ASSERT_EQ(malloc_resource->DataAs()[i], 7); + } + for (std::size_t i = n * 2; i < n * 3; ++i) { + ASSERT_EQ(malloc_resource->DataAs()[i], 3); + } }; test_malloc_resize(true); test_malloc_resize(false); diff --git a/tests/cpp/data/test_iterative_dmatrix.cc b/tests/cpp/data/test_iterative_dmatrix.cc index 74a69e109..b760b967d 100644 --- a/tests/cpp/data/test_iterative_dmatrix.cc +++ b/tests/cpp/data/test_iterative_dmatrix.cc @@ -12,8 +12,7 @@ #include "../helpers.h" #include "xgboost/data.h" // DMatrix -namespace xgboost { -namespace data { +namespace xgboost::data { TEST(IterativeDMatrix, Ref) { Context ctx; TestRefDMatrix( @@ -21,7 +20,7 @@ TEST(IterativeDMatrix, Ref) { } TEST(IterativeDMatrix, IsDense) { - int n_bins = 16; + bst_bin_t n_bins = 16; auto test = [n_bins](float sparsity) { NumpyArrayIterForTest iter(sparsity); auto n_threads = 0; @@ -38,5 +37,4 @@ TEST(IterativeDMatrix, IsDense) { test(0.1); test(1.0); } -} // namespace data -} // namespace xgboost +} // namespace xgboost::data diff --git a/tests/python/test_data_iterator.py b/tests/python/test_data_iterator.py index 24c117f15..e6bdfd2e7 100644 --- a/tests/python/test_data_iterator.py +++ b/tests/python/test_data_iterator.py @@ -1,4 +1,5 @@ -from typing import Callable, Dict, List +import weakref +from typing import Any, Callable, Dict, List import numpy as np import pytest @@ -179,5 +180,18 @@ def test_data_cache() -> None: data = 
make_batches(n_samples_per_batch, n_features, n_batches, False) batches = [v[0] for v in data] it = IterForCacheTest(*batches) + transform = xgb.data._proxy_transform + + called = 0 + + def mock(*args: Any, **kwargs: Any) -> Any: + nonlocal called + called += 1 + return transform(*args, **kwargs) + + xgb.data._proxy_transform = mock xgb.QuantileDMatrix(it) - assert it._input_id == id(batches[0]) + assert it._data_ref is weakref.ref(batches[0]) + assert called == 1 + + xgb.data._proxy_transform = transform diff --git a/tests/python/test_quantile_dmatrix.py b/tests/python/test_quantile_dmatrix.py index 8ee00b8c0..28a7eb37a 100644 --- a/tests/python/test_quantile_dmatrix.py +++ b/tests/python/test_quantile_dmatrix.py @@ -103,12 +103,29 @@ class TestQuantileDMatrix: *make_batches_sparse( n_samples_per_batch, n_features, n_batches, sparsity ), - None + None, ) Xy = xgb.QuantileDMatrix(it) assert Xy.num_row() == n_samples_per_batch * n_batches assert Xy.num_col() == n_features + def test_different_size(self) -> None: + n_samples_per_batch = 317 + n_features = 8 + n_batches = 7 + + it = IteratorForTest( + *make_batches( + n_samples_per_batch, n_features, n_batches, False, vary_size=True + ), + cache=None, + ) + Xy = xgb.QuantileDMatrix(it) + assert Xy.num_row() == 2429 + X, y, w = it.as_arrays() + Xy1 = xgb.QuantileDMatrix(X, y, weight=w) + assert predictor_equal(Xy, Xy1) + @pytest.mark.parametrize("sparsity", [0.0, 0.1, 0.5, 0.8, 0.9]) def test_training(self, sparsity: float) -> None: n_samples_per_batch = 317 @@ -123,7 +140,7 @@ class TestQuantileDMatrix: *make_batches_sparse( n_samples_per_batch, n_features, n_batches, sparsity ), - None + None, ) parameters = {"tree_method": "hist", "max_bin": 256} From a57371ef7c58acefe4bc794a23089a90734cc1a1 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 10 Aug 2023 02:38:14 +0800 Subject: [PATCH 077/136] Fix links in R doc. (#9450) --- R-package/R/xgb.plot.shap.R | 4 ++-- R-package/man/xgb.plot.shap.summary.Rd | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R-package/R/xgb.plot.shap.R b/R-package/R/xgb.plot.shap.R index 581f61dba..9efcb66ec 100644 --- a/R-package/R/xgb.plot.shap.R +++ b/R-package/R/xgb.plot.shap.R @@ -193,7 +193,7 @@ xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, #' hence allows us to see which features have a negative / positive contribution #' on the model prediction, and whether the contribution is different for larger #' or smaller values of the feature. We effectively try to replicate the -#' \code{summary_plot} function from https://github.com/slundberg/shap. +#' \code{summary_plot} function from https://github.com/shap/shap. #' #' @inheritParams xgb.plot.shap #' @@ -202,7 +202,7 @@ xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, #' #' @examples # See \code{\link{xgb.plot.shap}}. #' @seealso \code{\link{xgb.plot.shap}}, \code{\link{xgb.ggplot.shap.summary}}, -#' \url{https://github.com/slundberg/shap} +#' \url{https://github.com/shap/shap} xgb.plot.shap.summary <- function(data, shap_contrib = NULL, features = NULL, top_n = 10, model = NULL, trees = NULL, target_class = NULL, approxcontrib = FALSE, subsample = NULL) { # Only ggplot implementation is available. 
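A note on the weakref-based cache exercised by ``test_data_cache`` above: it relies on two CPython behaviors. Some objects (e.g. built-in ints) cannot be weak-referenced, hence the ``TypeError`` fallback, and repeated ``weakref.ref`` calls on the same object return the identical reference object when no callback is passed, which is why an ``is`` comparison suffices both in ``_next_wrapper`` and in the test assertion. Below is a minimal, self-contained sketch of the pattern; ``TransformCache`` and ``expensive_transform`` are hypothetical names for illustration, not part of XGBoost:

    import weakref

    import numpy as np

    class TransformCache:
        """Hypothetical sketch of the cache-key logic in DataIter._next_wrapper."""

        def __init__(self) -> None:
            self._result = None
            self._data_ref = None

        def get(self, data, expensive_transform):
            try:
                ref = weakref.ref(data)
            except TypeError:  # e.g. a built-in int cannot be weak-referenced
                ref = None
            if self._result is not None and ref is not None and ref is self._data_ref:
                return self._result  # cache hit, skip the transform
            self._result = expensive_transform(data)
            self._data_ref = ref
            return self._result

    cache = TransformCache()
    X = np.random.randn(4, 2)
    a = cache.get(X, lambda d: d * 2)
    b = cache.get(X, lambda d: d * 2)  # second call reuses the cached result
    assert a is b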
diff --git a/R-package/man/xgb.plot.shap.summary.Rd b/R-package/man/xgb.plot.shap.summary.Rd index f757fd740..3ff8af21c 100644 --- a/R-package/man/xgb.plot.shap.summary.Rd +++ b/R-package/man/xgb.plot.shap.summary.Rd @@ -67,12 +67,12 @@ Each point (observation) is coloured based on its feature value. The plot hence allows us to see which features have a negative / positive contribution on the model prediction, and whether the contribution is different for larger or smaller values of the feature. We effectively try to replicate the -\code{summary_plot} function from https://github.com/slundberg/shap. +\code{summary_plot} function from https://github.com/shap/shap. } \examples{ # See \code{\link{xgb.plot.shap}}. } \seealso{ \code{\link{xgb.plot.shap}}, \code{\link{xgb.ggplot.shap.summary}}, - \url{https://github.com/slundberg/shap} + \url{https://github.com/shap/shap} } From 1caa93221a11f0dd160731c10006ac6acb11da17 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 10 Aug 2023 14:05:27 +0800 Subject: [PATCH 078/136] Use `realloc` for histogram cache and expose the cache limit. (#9455) --- doc/parameter.rst | 9 +++++ python-package/xgboost/testing/params.py | 2 +- src/common/ref_resource_view.h | 37 ++++++++++++++++++++- src/tree/hist/hist_cache.h | 22 +++++++----- src/tree/hist/histogram.h | 2 +- src/tree/hist/param.h | 4 +-- src/tree/updater_gpu_hist.cu | 3 ++ tests/cpp/tree/hist/test_evaluate_splits.cc | 8 ++--- tests/cpp/tree/hist/test_histogram.cc | 2 +- tests/cpp/tree/test_evaluate_splits.h | 2 +- 10 files changed, 71 insertions(+), 20 deletions(-) diff --git a/doc/parameter.rst b/doc/parameter.rst index 1b1bb80a4..fdb4b8357 100644 --- a/doc/parameter.rst +++ b/doc/parameter.rst @@ -226,6 +226,15 @@ Parameters for Tree Booster - ``one_output_per_tree``: One model for each target. - ``multi_output_tree``: Use multi-target trees. +* ``max_cached_hist_node``, [default = 65536] + + Maximum number of cached nodes for the CPU histogram. + + .. versionadded:: 2.0.0 + + - In most cases this parameter should not be set, except when growing deep trees + on CPU. + ..
_cat-param: Parameters for Categorical Feature diff --git a/python-package/xgboost/testing/params.py b/python-package/xgboost/testing/params.py index 4ed8f4c4e..6b47f4a01 100644 --- a/python-package/xgboost/testing/params.py +++ b/python-package/xgboost/testing/params.py @@ -42,7 +42,7 @@ hist_parameter_strategy = strategies.fixed_dictionaries( ) hist_cache_strategy = strategies.fixed_dictionaries( - {"internal_max_cached_hist_node": strategies.sampled_from([1, 4, 1024, 2**31])} + {"max_cached_hist_node": strategies.sampled_from([1, 4, 1024, 2**31])} ) hist_multi_parameter_strategy = strategies.fixed_dictionaries( diff --git a/src/common/ref_resource_view.h b/src/common/ref_resource_view.h index 2804d79eb..0fadf846d 100644 --- a/src/common/ref_resource_view.h +++ b/src/common/ref_resource_view.h @@ -35,6 +35,13 @@ class RefResourceView { size_type size_{0}; std::shared_ptr mem_{nullptr}; + protected: + void Init(value_type* ptr, size_type size, std::shared_ptr mem) { + ptr_ = ptr; + size_ = size; + mem_ = std::move(mem); + } + public: RefResourceView(value_type* ptr, size_type n, std::shared_ptr mem) : ptr_{ptr}, size_{n}, mem_{std::move(mem)} { @@ -60,11 +67,11 @@ class RefResourceView { RefResourceView() = default; RefResourceView(RefResourceView const& that) = delete; - RefResourceView(RefResourceView&& that) = delete; RefResourceView& operator=(RefResourceView const& that) = delete; /** * @brief We allow move assignment for lazy initialization. */ + RefResourceView(RefResourceView&& that) = default; RefResourceView& operator=(RefResourceView&& that) = default; [[nodiscard]] size_type size() const { return size_; } // NOLINT @@ -154,5 +161,33 @@ template auto resource = std::make_shared(n_elements * sizeof(T)); return RefResourceView{resource->DataAs(), n_elements, resource, init}; } + +template +class ReallocVector : public RefResourceView { + static_assert(!std::is_reference_v); + static_assert(!std::is_const_v); + static_assert(std::is_trivially_copyable_v); + + using Upper = RefResourceView; + using size_type = typename Upper::size_type; // NOLINT + using value_type = typename Upper::value_type; // NOLINT + + public: + ReallocVector() : RefResourceView{MakeFixedVecWithMalloc(0, T{})} {} + + ReallocVector(size_type n, value_type const& init) + : RefResourceView{MakeFixedVecWithMalloc(n, init)} {} + ReallocVector(ReallocVector const& that) = delete; + ReallocVector(ReallocVector&& that) = delete; + ReallocVector& operator=(ReallocVector const& that) = delete; + ReallocVector& operator=(ReallocVector&& that) = delete; + + void Resize(typename Upper::size_type new_size) { + auto resource = std::dynamic_pointer_cast(this->Resource()); + CHECK(resource); + resource->Resize(new_size * sizeof(T)); + this->Init(resource->template DataAs(), new_size, resource); + } +}; } // namespace xgboost::common #endif // XGBOOST_COMMON_REF_RESOURCE_VIEW_H_ diff --git a/src/tree/hist/hist_cache.h b/src/tree/hist/hist_cache.h index 79e5d9bad..8a2ba193a 100644 --- a/src/tree/hist/hist_cache.h +++ b/src/tree/hist/hist_cache.h @@ -5,12 +5,14 @@ #define XGBOOST_TREE_HIST_HIST_CACHE_H_ #include // for size_t #include // for map +#include // for unique_ptr #include // for vector -#include "../../common/hist_util.h" // for GHistRow, ConstGHistRow -#include "xgboost/base.h" // for bst_node_t, bst_bin_t -#include "xgboost/logging.h" // for CHECK_GT -#include "xgboost/span.h" // for Span +#include "../../common/hist_util.h" // for GHistRow, ConstGHistRow +#include "../../common/ref_resource_view.h" // for 
ReallocVector +#include "xgboost/base.h" // for bst_node_t, bst_bin_t +#include "xgboost/logging.h" // for CHECK_GT +#include "xgboost/span.h" // for Span namespace xgboost::tree { /** @@ -32,7 +34,8 @@ class BoundedHistCollection { std::size_t current_size_{0}; // stores the histograms in a contiguous buffer - std::vector data_; + using Vec = common::ReallocVector; + std::unique_ptr data_{new Vec{}}; // nvcc 12.1 trips over std::make_unique // number of histogram bins across all features bst_bin_t n_total_bins_{0}; @@ -42,13 +45,14 @@ class BoundedHistCollection { bool has_exceeded_{false}; public: + BoundedHistCollection() = default; common::GHistRow operator[](std::size_t idx) { auto offset = node_map_.at(idx); - return common::Span{data_.data(), data_.size()}.subspan(offset, n_total_bins_); + return common::Span{data_->data(), data_->size()}.subspan(offset, n_total_bins_); } common::ConstGHistRow operator[](std::size_t idx) const { auto offset = node_map_.at(idx); - return common::Span{data_.data(), data_.size()}.subspan(offset, n_total_bins_); + return common::Span{data_->data(), data_->size()}.subspan(offset, n_total_bins_); } void Reset(bst_bin_t n_total_bins, std::size_t n_cached_nodes) { n_total_bins_ = n_total_bins; @@ -81,8 +85,8 @@ class BoundedHistCollection { auto n_new_nodes = nodes_to_build.size() + nodes_to_sub.size(); auto alloc_size = n_new_nodes * n_total_bins_; auto new_size = alloc_size + current_size_; - if (new_size > data_.size()) { - data_.resize(new_size); + if (new_size > data_->size()) { + data_->Resize(new_size); } for (auto nidx : nodes_to_build) { node_map_[nidx] = current_size_; diff --git a/src/tree/hist/histogram.h b/src/tree/hist/histogram.h index 54c716887..f378c7808 100644 --- a/src/tree/hist/histogram.h +++ b/src/tree/hist/histogram.h @@ -63,7 +63,7 @@ class HistogramBuilder { bool is_col_split, HistMakerTrainParam const *param) { n_threads_ = ctx->Threads(); param_ = p; - hist_.Reset(total_bins, param->internal_max_cached_hist_node); + hist_.Reset(total_bins, param->max_cached_hist_node); buffer_.Init(total_bins); is_distributed_ = is_distributed; is_col_split_ = is_col_split; diff --git a/src/tree/hist/param.h b/src/tree/hist/param.h index 0f2f4ac00..8757b65e6 100644 --- a/src/tree/hist/param.h +++ b/src/tree/hist/param.h @@ -13,7 +13,7 @@ struct HistMakerTrainParam : public XGBoostParameter { constexpr static std::size_t DefaultNodes() { return static_cast(1) << 16; } bool debug_synchronize{false}; - std::size_t internal_max_cached_hist_node{DefaultNodes()}; + std::size_t max_cached_hist_node{DefaultNodes()}; void CheckTreesSynchronized(RegTree const* local_tree) const; @@ -22,7 +22,7 @@ struct HistMakerTrainParam : public XGBoostParameter { DMLC_DECLARE_FIELD(debug_synchronize) .set_default(false) .describe("Check if all distributed tree are identical after tree construction."); - DMLC_DECLARE_FIELD(internal_max_cached_hist_node) + DMLC_DECLARE_FIELD(max_cached_hist_node) .set_default(DefaultNodes()) .set_lower_bound(1) .describe("Maximum number of nodes in CPU histogram cache. 
Only for internal usage."); diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 0403c7881..5cce89e2c 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -866,6 +866,9 @@ class GPUGlobalApproxMaker : public TreeUpdater { // Used in test to count how many configurations are performed LOG(DEBUG) << "[GPU Approx]: Configure"; hist_maker_param_.UpdateAllowUnknown(args); + if (hist_maker_param_.max_cached_hist_node != HistMakerTrainParam::DefaultNodes()) { + LOG(WARNING) << "The `max_cached_hist_node` is ignored in GPU."; + } dh::CheckComputeCapability(); initialised_ = false; diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc index 1685a3c80..095284a38 100644 --- a/tests/cpp/tree/hist/test_evaluate_splits.cc +++ b/tests/cpp/tree/hist/test_evaluate_splits.cc @@ -51,7 +51,7 @@ void TestEvaluateSplits(bool force_read_by_column) { row_set_collection.Init(); HistMakerTrainParam hist_param; - hist.Reset(gmat.cut.Ptrs().back(), hist_param.internal_max_cached_hist_node); + hist.Reset(gmat.cut.Ptrs().back(), hist_param.max_cached_hist_node); hist.AllocateHistograms({0}); common::BuildHist(row_gpairs, row_set_collection[0], gmat, hist[0], force_read_by_column); @@ -118,7 +118,7 @@ TEST(HistMultiEvaluator, Evaluate) { linalg::Vector root_sum({2}, Context::kCpuId); for (bst_target_t t{0}; t < n_targets; ++t) { auto &hist = histogram[t]; - hist.Reset(n_bins * n_features, hist_param.internal_max_cached_hist_node); + hist.Reset(n_bins * n_features, hist_param.max_cached_hist_node); hist.AllocateHistograms({0}); auto node_hist = hist[0]; node_hist[0] = {-0.5, 0.5}; @@ -235,7 +235,7 @@ auto CompareOneHotAndPartition(bool onehot) { entries.front().nid = 0; entries.front().depth = 0; - hist.Reset(gmat.cut.TotalBins(), hist_param.internal_max_cached_hist_node); + hist.Reset(gmat.cut.TotalBins(), hist_param.max_cached_hist_node); hist.AllocateHistograms({0}); auto node_hist = hist[0]; @@ -265,7 +265,7 @@ TEST(HistEvaluator, Categorical) { TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) { BoundedHistCollection hist; HistMakerTrainParam hist_param; - hist.Reset(cuts_.TotalBins(), hist_param.internal_max_cached_hist_node); + hist.Reset(cuts_.TotalBins(), hist_param.max_cached_hist_node); hist.AllocateHistograms({0}); auto node_hist = hist[0]; ASSERT_EQ(node_hist.size(), feature_histogram_.size()); diff --git a/tests/cpp/tree/hist/test_histogram.cc b/tests/cpp/tree/hist/test_histogram.cc index b90b43101..8949b5f4b 100644 --- a/tests/cpp/tree/hist/test_histogram.cc +++ b/tests/cpp/tree/hist/test_histogram.cc @@ -516,7 +516,7 @@ class OverflowTest : public ::testing::TestWithParam> { Context ctx; HistMakerTrainParam hist_param; if (limit) { - hist_param.Init(Args{{"internal_max_cached_hist_node", "1"}}); + hist_param.Init(Args{{"max_cached_hist_node", "1"}}); } std::shared_ptr Xy = diff --git a/tests/cpp/tree/test_evaluate_splits.h b/tests/cpp/tree/test_evaluate_splits.h index 04da4777d..6cb75e23b 100644 --- a/tests/cpp/tree/test_evaluate_splits.h +++ b/tests/cpp/tree/test_evaluate_splits.h @@ -59,7 +59,7 @@ class TestPartitionBasedSplit : public ::testing::Test { cuts_.min_vals_.Resize(1); HistMakerTrainParam hist_param; - hist_.Reset(cuts_.TotalBins(), hist_param.internal_max_cached_hist_node); + hist_.Reset(cuts_.TotalBins(), hist_param.max_cached_hist_node); hist_.AllocateHistograms({0}); auto node_hist = hist_[0]; From 4359356d46db7d30633ee13391233bb4839b7ca5 Mon Sep 17 00:00:00 2001 From: James Lamb 
Date: Thu, 10 Aug 2023 04:49:16 -0500 Subject: [PATCH 079/136] [R] [CI] use lintr 3.1.0 (#9456) --- R-package/R/xgb.model.dt.tree.R | 3 +-- R-package/R/xgb.plot.deepness.R | 2 +- R-package/demo/interaction_constraints.R | 2 +- R-package/tests/testthat/test_helpers.R | 2 +- tests/ci_build/lint_r.R | 15 ++++++++++++--- 5 files changed, 16 insertions(+), 8 deletions(-) diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index 987117d56..fa11c50fb 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -86,8 +86,7 @@ xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, text <- xgb.dump(model = model, with_stats = TRUE) } - if (length(text) < 2 || - sum(grepl('leaf=(\\d+)', text)) < 1) { + if (length(text) < 2 || !any(grepl('leaf=(\\d+)', text))) { stop("Non-tree model detected! This function can only be used with tree models.") } diff --git a/R-package/R/xgb.plot.deepness.R b/R-package/R/xgb.plot.deepness.R index 6579fb511..f6230e1ab 100644 --- a/R-package/R/xgb.plot.deepness.R +++ b/R-package/R/xgb.plot.deepness.R @@ -136,7 +136,7 @@ get.leaf.depth <- function(dt_tree) { # list of paths to each leaf in a tree paths <- lapply(paths_tmp$vpath, names) # combine into a resulting path lengths table for a tree - data.table(Depth = sapply(paths, length), ID = To[Leaf == TRUE]) + data.table(Depth = lengths(paths), ID = To[Leaf == TRUE]) }, by = Tree] } diff --git a/R-package/demo/interaction_constraints.R b/R-package/demo/interaction_constraints.R index 6da541b9b..9e694e3eb 100644 --- a/R-package/demo/interaction_constraints.R +++ b/R-package/demo/interaction_constraints.R @@ -44,7 +44,7 @@ treeInteractions <- function(input_tree, input_max_depth) { # Remove non-interactions (same variable) interaction_list <- lapply(interaction_list, unique) # remove same variables - interaction_length <- sapply(interaction_list, length) + interaction_length <- lengths(interaction_list) interaction_list <- interaction_list[interaction_length > 1] interaction_list <- unique(lapply(interaction_list, sort)) return(interaction_list) diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index b48c8c707..f00ac0881 100644 --- a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -189,7 +189,7 @@ test_that("SHAPs sum to predictions, with or without DART", { tol <- 1e-5 expect_equal(rowSums(shap), pred, tol = tol) - expect_equal(apply(shapi, 1, sum), pred, tol = tol) + expect_equal(rowSums(shapi), pred, tol = tol) for (i in seq_len(nrow(d))) for (f in list(rowSums, colSums)) expect_equal(f(shapi[i, , ]), shap[i, ], tol = tol) diff --git a/tests/ci_build/lint_r.R b/tests/ci_build/lint_r.R index 9d50fe433..ce512482d 100644 --- a/tests/ci_build/lint_r.R +++ b/tests/ci_build/lint_r.R @@ -20,15 +20,23 @@ my_linters <- list( any_duplicated = lintr::any_duplicated_linter(), any_is_na = lintr::any_is_na_linter(), assignment_linter = lintr::assignment_linter(), + boolean_arithmetic = lintr::boolean_arithmetic_linter(), brace_linter = lintr::brace_linter(), + class_equals = lintr::class_equals_linter(), commas_linter = lintr::commas_linter(), + empty_assignment = lintr::empty_assignment_linter(), equals_na = lintr::equals_na_linter(), fixed_regex = lintr::fixed_regex_linter(), + for_loop_index = lintr::for_loop_index_linter(), + function_return = lintr::function_return_linter(), infix_spaces_linter = lintr::infix_spaces_linter(), + is_numeric = lintr::is_numeric_linter(), 
line_length_linter = lintr::line_length_linter(length = 150L), - no_tab_linter = lintr::no_tab_linter(), + lengths = lintr::lengths_linter(), + matrix = lintr::matrix_apply_linter(), object_usage_linter = lintr::object_usage_linter(), object_length_linter = lintr::object_length_linter(), + routine_registration = lintr::routine_registration_linter(), semicolon = lintr::semicolon_linter(), seq = lintr::seq_linter(), spaces_inside_linter = lintr::spaces_inside_linter(), @@ -37,9 +45,10 @@ my_linters <- list( trailing_blank_lines_linter = lintr::trailing_blank_lines_linter(), trailing_whitespace_linter = lintr::trailing_whitespace_linter(), true_false = lintr::T_and_F_symbol_linter(), - unneeded_concatenation = lintr::unneeded_concatenation_linter(), + unnecessary_concatenation = lintr::unnecessary_concatenation_linter(), unreachable_code = lintr::unreachable_code_linter(), - vector_logic = lintr::vector_logic_linter() + vector_logic = lintr::vector_logic_linter(), + whitespace = lintr::whitespace_linter() ) noquote(paste0(length(FILES_TO_LINT), " R files need linting")) From 9dbb71490c4737a55ed8b12296f53dfbf5b910c7 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 10 Aug 2023 06:26:36 -0500 Subject: [PATCH 080/136] [Doc] fix typos in documentation (#9458) --- .gitignore | 1 + doc/build.rst | 2 +- doc/contrib/ci.rst | 4 ++-- doc/contrib/coding_guide.rst | 2 +- doc/contrib/donate.rst | 2 +- doc/contrib/unit_tests.rst | 2 +- doc/faq.rst | 2 +- doc/jvm/java_intro.rst | 2 +- doc/prediction.rst | 4 ++-- doc/python/sklearn_estimator.rst | 2 +- doc/treemethod.rst | 2 +- doc/tutorials/c_api_tutorial.rst | 10 +++++----- doc/tutorials/custom_metric_obj.rst | 6 +++--- doc/tutorials/dask.rst | 4 ++-- doc/tutorials/external_memory.rst | 8 ++++---- doc/tutorials/feature_interaction_constraint.rst | 2 +- doc/tutorials/learning_to_rank.rst | 6 +++--- doc/tutorials/ray.rst | 2 +- 18 files changed, 32 insertions(+), 31 deletions(-) diff --git a/.gitignore b/.gitignore index 3a606c847..672b5bcde 100644 --- a/.gitignore +++ b/.gitignore @@ -48,6 +48,7 @@ Debug *.Rproj ./xgboost.mpi ./xgboost.mock +*.bak #.Rbuildignore R-package.Rproj *.cache* diff --git a/doc/build.rst b/doc/build.rst index e30d57bc8..cba75ff57 100644 --- a/doc/build.rst +++ b/doc/build.rst @@ -119,7 +119,7 @@ An up-to-date version of the CUDA toolkit is required. .. note:: Checking your compiler version - CUDA is really picky about supported compilers, a table for the compatible compilers for the latests CUDA version on Linux can be seen `here `_. + CUDA is really picky about supported compilers, a table for the compatible compilers for the latest CUDA version on Linux can be seen `here `_. Some distros package a compatible ``gcc`` version with CUDA. If you run into compiler errors with ``nvcc``, try specifying the correct compiler with ``-DCMAKE_CXX_COMPILER=/path/to/correct/g++ -DCMAKE_C_COMPILER=/path/to/correct/gcc``. On Arch Linux, for example, both binaries can be found under ``/opt/cuda/bin/``. diff --git a/doc/contrib/ci.rst b/doc/contrib/ci.rst index 76e06de35..2db6f80bc 100644 --- a/doc/contrib/ci.rst +++ b/doc/contrib/ci.rst @@ -32,7 +32,7 @@ GitHub Actions is also used to build Python wheels targeting MacOS Intel and App ``python_wheels`` pipeline sets up environment variables prefixed ``CIBW_*`` to indicate the target OS and processor. The pipeline then invokes the script ``build_python_wheels.sh``, which in turns calls ``cibuildwheel`` to build the wheel. 
The ``cibuildwheel`` is a library that sets up a -suitable Python environment for each OS and processor target. Since we don't have Apple Silion +suitable Python environment for each OS and processor target. Since we don't have Apple Silicon machine in GitHub Actions, cross-compilation is needed; ``cibuildwheel`` takes care of the complex task of cross-compiling a Python wheel. (Note that ``cibuildwheel`` will call ``pip wheel``. Since XGBoost has a native library component, we created a customized build @@ -131,7 +131,7 @@ set up a credential pair in order to provision resources on AWS. See Worker Image Pipeline ===================== Building images for worker machines used to be a chore: you'd provision an EC2 machine, SSH into it, and -manually install the necessary packages. This process is not only laborous but also error-prone. You may +manually install the necessary packages. This process is not only laborious but also error-prone. You may forget to install a package or change a system configuration. No more. Now we have an automated pipeline for building images for worker machines. diff --git a/doc/contrib/coding_guide.rst b/doc/contrib/coding_guide.rst index f939a17b2..e799ad286 100644 --- a/doc/contrib/coding_guide.rst +++ b/doc/contrib/coding_guide.rst @@ -100,7 +100,7 @@ two automatic checks to enforce coding style conventions. To expedite the code r Linter ====== -We use `pylint `_ and `cpplint `_ to enforce style convention and find potential errors. Linting is especially useful for Python, as we can catch many errors that would have otherwise occured at run-time. +We use `pylint `_ and `cpplint `_ to enforce style convention and find potential errors. Linting is especially useful for Python, as we can catch many errors that would have otherwise occurred at run-time. To run this check locally, run the following command from the top level source tree: diff --git a/doc/contrib/donate.rst b/doc/contrib/donate.rst index cc373d2b8..b6171c412 100644 --- a/doc/contrib/donate.rst +++ b/doc/contrib/donate.rst @@ -29,7 +29,7 @@ The Project Management Committee (PMC) of the XGBoost project appointed `Open So All expenses incurred for hosting CI will be submitted to the fiscal host with receipts. Only the expenses in the following categories will be approved for reimbursement: -* Cloud exprenses for the cloud test farm (https://buildkite.com/xgboost) +* Cloud expenses for the cloud test farm (https://buildkite.com/xgboost) * Cost of domain https://xgboost-ci.net * Monthly cost of using BuildKite * Hosting cost of the User Forum (https://discuss.xgboost.ai) diff --git a/doc/contrib/unit_tests.rst b/doc/contrib/unit_tests.rst index 5131dbabb..ef4ad1480 100644 --- a/doc/contrib/unit_tests.rst +++ b/doc/contrib/unit_tests.rst @@ -169,7 +169,7 @@ supply a specified SANITIZER_PATH. How to use sanitizers with CUDA support ======================================= -Runing XGBoost on CUDA with address sanitizer (asan) will raise memory error. +Running XGBoost on CUDA with address sanitizer (asan) will raise memory error. To use asan with CUDA correctly, you need to configure asan via ASAN_OPTIONS environment variable: diff --git a/doc/faq.rst b/doc/faq.rst index 072a7f975..51de4bbc8 100644 --- a/doc/faq.rst +++ b/doc/faq.rst @@ -63,7 +63,7 @@ XGBoost supports missing values by default. In tree algorithms, branch directions for missing values are learned during training. Note that the gblinear booster treats missing values as zeros. 
-When the ``missing`` parameter is specifed, values in the input predictor that is equal to +When the ``missing`` parameter is specified, values in the input predictor that is equal to ``missing`` will be treated as missing and removed. By default it's set to ``NaN``. ************************************** diff --git a/doc/jvm/java_intro.rst b/doc/jvm/java_intro.rst index 29fed9644..c5aba8e6a 100644 --- a/doc/jvm/java_intro.rst +++ b/doc/jvm/java_intro.rst @@ -129,7 +129,7 @@ With parameters and data, you are able to train a booster model. booster.saveModel("model.bin"); -* Generaing model dump with feature map +* Generating model dump with feature map .. code-block:: java diff --git a/doc/prediction.rst b/doc/prediction.rst index 026009d59..c94ddfbbf 100644 --- a/doc/prediction.rst +++ b/doc/prediction.rst @@ -54,7 +54,7 @@ After 1.4 release, we added a new parameter called ``strict_shape``, one can set Output is a 4-dim array with ``(n_samples, n_iterations, n_classes, n_trees_in_forest)`` as shape. ``n_trees_in_forest`` is specified by the ``numb_parallel_tree`` during training. When strict shape is set to False, output is a 2-dim array with last 3 dims - concatenated into 1. Also the last dimension is dropped if it eqauls to 1. When using + concatenated into 1. Also the last dimension is dropped if it equals to 1. When using ``apply`` method in scikit learn interface, this is set to False by default. @@ -68,7 +68,7 @@ n_classes, n_trees_in_forest)``, while R with ``strict_shape=TRUE`` outputs Other than these prediction types, there's also a parameter called ``iteration_range``, which is similar to model slicing. But instead of actually splitting up the model into multiple stacks, it simply returns the prediction formed by the trees within range. -Number of trees created in each iteration eqauls to :math:`trees_i = num\_class \times +Number of trees created in each iteration equals to :math:`trees_i = num\_class \times num\_parallel\_tree`. So if you are training a boosted random forest with size of 4, on the 3-class classification dataset, and want to use the first 2 iterations of trees for prediction, you need to provide ``iteration_range=(0, 2)``. Then the first :math:`2 diff --git a/doc/python/sklearn_estimator.rst b/doc/python/sklearn_estimator.rst index 9748dbebd..a4835dcac 100644 --- a/doc/python/sklearn_estimator.rst +++ b/doc/python/sklearn_estimator.rst @@ -20,7 +20,7 @@ sklearn estimator interface is still working in progress. You can find some some quick start examples at :ref:`sphx_glr_python_examples_sklearn_examples.py`. The main advantage of using sklearn -interface is that it works with most of the utilites provided by sklearn like +interface is that it works with most of the utilities provided by sklearn like :py:func:`sklearn.model_selection.cross_validate`. Also, many other libraries recognize the sklearn estimator interface thanks to its popularity. diff --git a/doc/treemethod.rst b/doc/treemethod.rst index 1f83401fe..a3b2b8c2e 100644 --- a/doc/treemethod.rst +++ b/doc/treemethod.rst @@ -68,7 +68,7 @@ Other Updaters 1. ``Prune``: It prunes the existing trees. ``prune`` is usually used as part of other tree methods. To use pruner independently, one needs to set the process type to update by: ``{"process_type": "update", "updater": "prune"}``. 
With this set of parameters, - during trianing, XGBOost will prune the existing trees according to 2 parameters + during training, XGBoost will prune the existing trees according to 2 parameters ``min_split_loss (gamma)`` and ``max_depth``. 2. ``Refresh``: Refresh the statistic of built trees on a new training dataset. Like the diff --git a/doc/tutorials/c_api_tutorial.rst b/doc/tutorials/c_api_tutorial.rst index 090743a0f..3c33278be 100644 --- a/doc/tutorials/c_api_tutorial.rst +++ b/doc/tutorials/c_api_tutorial.rst @@ -55,7 +55,7 @@ To ensure that CMake can locate the XGBoost library, supply ``-DCMAKE_PREFIX_PAT .. code-block:: bash - # Nagivate to the build directory for your application + # Navigate to the build directory for your application cd build # Activate the Conda environment where we previously installed XGBoost conda activate [env_name] @@ -65,7 +65,7 @@ To ensure that CMake can locate the XGBoost library, supply ``-DCMAKE_PREFIX_PAT make ************************ -Usefull Tips To Remember +Useful Tips To Remember ************************ Below are some useful tips while using C API: @@ -151,7 +151,7 @@ c. Assertion technique: It works both in C/ C++. If expression evaluates to 0 (f Example if we our training data is in ``dense matrix`` format then your prediction dataset should also be a ``dense matrix`` or if training in ``libsvm`` format then dataset for prediction should also be in ``libsvm`` format. -4. Always use strings for setting values to the parameters in booster handle object. The paramter value can be of any data type (e.g. int, char, float, double, etc), but they should always be encoded as strings. +4. Always use strings for setting values to the parameters in booster handle object. The parameter value can be of any data type (e.g. int, char, float, double, etc), but they should always be encoded as strings. .. code-block:: c @@ -168,7 +168,7 @@ Sample examples along with Code snippet to use C API functions .. code-block:: c DMatrixHandle data; // handle to DMatrix - // Load the dat from file & store it in data variable of DMatrixHandle datatype + // Load the data from file & store it in data variable of DMatrixHandle datatype safe_xgboost(XGDMatrixCreateFromFile("/path/to/file/filename", silent, &data)); @@ -278,7 +278,7 @@ Sample examples along with Code snippet to use C API functions uint64_t const* out_shape; /* Dimension of output prediction */ uint64_t out_dim; - /* Pointer to a thread local contigious array, assigned in prediction function. */ + /* Pointer to a thread local contiguous array, assigned in prediction function. */ float const* out_result = NULL; safe_xgboost( XGBoosterPredictFromDMatrix(booster, dmatrix, config, &out_shape, &out_dim, &out_result)); diff --git a/doc/tutorials/custom_metric_obj.rst b/doc/tutorials/custom_metric_obj.rst index c6d5fbff5..f5c08bf59 100644 --- a/doc/tutorials/custom_metric_obj.rst +++ b/doc/tutorials/custom_metric_obj.rst @@ -38,7 +38,7 @@ Although XGBoost has native support for said functions, using it for demonstrati provides us the opportunity of comparing the result from our own implementation and the one from XGBoost internal for learning purposes. After finishing this tutorial, we should be able to provide our own functions for rapid experiments. And at the end, we will -provide some notes on non-identy link function along with examples of using custom metric +provide some notes on non-identity link function along with examples of using custom metric and objective with the `scikit-learn` interface. 
If we compute the gradient of said objective function: @@ -165,7 +165,7 @@ Reverse Link Function When using builtin objective, the raw prediction is transformed according to the objective function. When a custom objective is provided XGBoost doesn't know its link function so the user is responsible for making the transformation for both objective and custom evaluation -metric. For objective with identiy link like ``squared error`` this is trivial, but for +metric. For objective with identity link like ``squared error`` this is trivial, but for other link functions like log link or inverse link the difference is significant. For the Python package, the behaviour of prediction can be controlled by the @@ -173,7 +173,7 @@ For the Python package, the behaviour of prediction can be controlled by the parameter without a custom objective, the metric function will receive transformed prediction since the objective is defined by XGBoost. However, when the custom objective is also provided along with that metric, then both the objective and custom metric will -recieve raw prediction. The following example provides a comparison between two different +receive raw prediction. The following example provides a comparison between two different behavior with a multi-class classification model. Firstly we define 2 different Python metric functions implementing the same underlying metric for comparison, `merror_with_transform` is used when custom objective is also used, otherwise the simpler diff --git a/doc/tutorials/dask.rst b/doc/tutorials/dask.rst index 131929b24..7ab251bcf 100644 --- a/doc/tutorials/dask.rst +++ b/doc/tutorials/dask.rst @@ -256,7 +256,7 @@ In the example below, a ``KubeCluster`` is used for `deploying Dask on Kubernete m = 1000 n = 10 kWorkers = 2 # assuming you have 2 GPU nodes on that cluster. - # You need to work out the worker-spec youself. See document in dask_kubernetes for + # You need to work out the worker-spec yourself. See document in dask_kubernetes for # its usage. Here we just want to show that XGBoost works on various clusters. cluster = KubeCluster.from_yaml('worker-spec.yaml', deploy_mode='remote') cluster.scale(kWorkers) # scale to use all GPUs @@ -648,7 +648,7 @@ environment than training the model using a single node due to aforementioned cr Memory Usage ************ -Here are some pratices on reducing memory usage with dask and xgboost. +Here are some practices on reducing memory usage with dask and xgboost. - In a distributed work flow, data is best loaded by dask collections directly instead of loaded by client process. When loading with client process is unavoidable, use diff --git a/doc/tutorials/external_memory.rst b/doc/tutorials/external_memory.rst index 811db6bd5..6daacf741 100644 --- a/doc/tutorials/external_memory.rst +++ b/doc/tutorials/external_memory.rst @@ -7,7 +7,7 @@ dataset needs to be loaded into memory. This can be costly and sometimes infeasible. Staring from 1.5, users can define a custom iterator to load data in chunks for running XGBoost algorithms. External memory can be used for both training and prediction, but training is the primary use case and it will be our focus in this -tutorial. For prediction and evaluation, users can iterate through the data themseleves +tutorial. For prediction and evaluation, users can iterate through the data themselves while training requires the full dataset to be loaded into the memory. 
During training, there are two different modes for external memory support available in @@ -142,7 +142,7 @@ see `this paper `_. .. warning:: When GPU is running out of memory during iteration on external memory, user might - recieve a segfault instead of an OOM exception. + receive a segfault instead of an OOM exception. .. _ext_remarks: @@ -150,7 +150,7 @@ see `this paper `_. Remarks ******* -When using external memory with XBGoost, data is divided into smaller chunks so that only +When using external memory with XGBoost, data is divided into smaller chunks so that only a fraction of it needs to be stored in memory at any given time. It's important to note that this method only applies to the predictor data (``X``), while other data, like labels and internal runtime structures are concatenated. This means that memory reduction is most @@ -211,7 +211,7 @@ construction of `QuantileDmatrix` with data chunks. On the other hand, if it's p doesn't fetch data during training. On the other hand, the external memory `DMatrix` fetches data batches from external memory on-demand. Use the `QuantileDMatrix` (with iterator if necessary) when you can fit most of your data in memory. The training would be -an order of magnitute faster than using external memory. +an order of magnitude faster than using external memory. **************** Text File Inputs diff --git a/doc/tutorials/feature_interaction_constraint.rst b/doc/tutorials/feature_interaction_constraint.rst index 07e5f5676..b3d655584 100644 --- a/doc/tutorials/feature_interaction_constraint.rst +++ b/doc/tutorials/feature_interaction_constraint.rst @@ -233,7 +233,7 @@ This has lead to some interesting implications of feature interaction constraint ``[[0, 1], [0, 1, 2], [1, 2]]`` as another example. Assuming we have only 3 available features in our training datasets for presentation purpose, careful readers might have found out that the above constraint is the same as simply ``[[0, 1, 2]]``. Since no matter which -feature is chosen for split in the root node, all its descendants are allowd to include every +feature is chosen for split in the root node, all its descendants are allowed to include every feature as legitimate split candidates without violating interaction constraints. For one last example, we use ``[[0, 1], [1, 3, 4]]`` and choose feature ``0`` as split for diff --git a/doc/tutorials/learning_to_rank.rst b/doc/tutorials/learning_to_rank.rst index 965a623c3..c562dc2df 100644 --- a/doc/tutorials/learning_to_rank.rst +++ b/doc/tutorials/learning_to_rank.rst @@ -11,12 +11,12 @@ Learning to Rank ******** Overview ******** -Often in the context of information retrieval, learning-to-rank aims to train a model that arranges a set of query results into an ordered list `[1] <#references>`__. For surprivised learning-to-rank, the predictors are sample documents encoded as feature matrix, and the labels are relevance degree for each sample. Relevance degree can be multi-level (graded) or binary (relevant or not). The training samples are often grouped by their query index with each query group containing multiple query results. +Often in the context of information retrieval, learning-to-rank aims to train a model that arranges a set of query results into an ordered list `[1] <#references>`__. For supervised learning-to-rank, the predictors are sample documents encoded as feature matrix, and the labels are relevance degree for each sample. Relevance degree can be multi-level (graded) or binary (relevant or not). 
The training samples are often grouped by their query index with each query group containing multiple query results. XGBoost implements learning to rank through a set of objective functions and performance metrics. The default objective is ``rank:ndcg`` based on the ``LambdaMART`` `[2] <#references>`__ algorithm, which in turn is an adaptation of the ``LambdaRank`` `[3] <#references>`__ framework to gradient boosting trees. For a history and a summary of the algorithm, see `[5] <#references>`__. The implementation in XGBoost features deterministic GPU computation, distributed training, position debiasing and two different pair construction strategies. ************************************ -Training with the Pariwise Objective +Training with the Pairwise Objective ************************************ ``LambdaMART`` is a pairwise ranking model, meaning that it compares the relevance degree for every pair of samples in a query group and calculate a proxy gradient for each pair. The default objective ``rank:ndcg`` is using the surrogate gradient derived from the ``ndcg`` metric. To train a XGBoost model, we need an additional sorted array called ``qid`` for specifying the query group of input samples. An example input would look like this: @@ -59,7 +59,7 @@ Notice that the samples are sorted based on their query index in a non-decreasin X = X[sorted_idx, :] y = y[sorted_idx] -The simpliest way to train a ranking model is by using the scikit-learn estimator interface. Continuing the previous snippet, we can train a simple ranking model without tuning: +The simplest way to train a ranking model is by using the scikit-learn estimator interface. Continuing the previous snippet, we can train a simple ranking model without tuning: .. code-block:: python diff --git a/doc/tutorials/ray.rst b/doc/tutorials/ray.rst index 9c09db474..f3032c970 100644 --- a/doc/tutorials/ray.rst +++ b/doc/tutorials/ray.rst @@ -138,7 +138,7 @@ This will train on four GPUs in parallel. Note that it usually does not make sense to allocate more than one GPU per actor, as XGBoost relies on distributed libraries such as Dask or Ray to utilize multi -GPU taining. +GPU training. Setting the number of CPUs per actor ==================================== From 44bd2981b2be1a40253945b7e02b297a297b140b Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 10 Aug 2023 08:40:59 -0500 Subject: [PATCH 081/136] [R] remove default values in internal utility functions (#9457) --- R-package/R/utils.R | 8 ++++---- R-package/R/xgb.cv.R | 14 ++++++++++++-- R-package/R/xgb.train.R | 17 ++++++++++++++--- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index a822113a7..458b119f6 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -140,7 +140,7 @@ check.custom.eval <- function(env = parent.frame()) { # Update a booster handle for an iteration with dtrain data -xgb.iter.update <- function(booster_handle, dtrain, iter, obj = NULL) { +xgb.iter.update <- function(booster_handle, dtrain, iter, obj) { if (!identical(class(booster_handle), "xgb.Booster.handle")) { stop("booster_handle must be of xgb.Booster.handle class") } @@ -163,7 +163,7 @@ xgb.iter.update <- function(booster_handle, dtrain, iter, obj = NULL) { # Evaluate one iteration. # Returns a named vector of evaluation metrics # with the names in a 'datasetname-metricname' format. 
-xgb.iter.eval <- function(booster_handle, watchlist, iter, feval = NULL) { +xgb.iter.eval <- function(booster_handle, watchlist, iter, feval) { if (!identical(class(booster_handle), "xgb.Booster.handle")) stop("class of booster_handle must be xgb.Booster.handle") @@ -234,7 +234,7 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, params) { y <- factor(y) } } - folds <- xgb.createFolds(y, nfold) + folds <- xgb.createFolds(y = y, k = nfold) } else { # make simple non-stratified folds kstep <- length(rnd_idx) %/% nfold @@ -251,7 +251,7 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, params) { # Creates CV folds stratified by the values of y. # It was borrowed from caret::createFolds and simplified # by always returning an unnamed list of fold indices. -xgb.createFolds <- function(y, k = 10) { +xgb.createFolds <- function(y, k) { if (is.numeric(y)) { ## Group the numeric data based on their magnitudes ## and sample within those groups. diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 27730cbc3..788638921 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -223,8 +223,18 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing for (f in cb$pre_iter) f() msg <- lapply(bst_folds, function(fd) { - xgb.iter.update(fd$bst, fd$dtrain, iteration - 1, obj) - xgb.iter.eval(fd$bst, fd$watchlist, iteration - 1, feval) + xgb.iter.update( + booster_handle = fd$bst, + dtrain = fd$dtrain, + iter = iteration - 1, + obj = obj + ) + xgb.iter.eval( + booster_handle = fd$bst, + watchlist = fd$watchlist, + iter = iteration - 1, + feval = feval + ) }) msg <- simplify2array(msg) bst_evaluation <- rowMeans(msg) diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 5a7d2eb5e..729475945 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -390,10 +390,21 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), for (f in cb$pre_iter) f() - xgb.iter.update(bst$handle, dtrain, iteration - 1, obj) + xgb.iter.update( + booster_handle = bst$handle, + dtrain = dtrain, + iter = iteration - 1, + obj = obj + ) - if (length(watchlist) > 0) - bst_evaluation <- xgb.iter.eval(bst$handle, watchlist, iteration - 1, feval) # nolint: object_usage_linter + if (length(watchlist) > 0) { + bst_evaluation <- xgb.iter.eval( # nolint: object_usage_linter + booster_handle = bst$handle, + watchlist = watchlist, + iter = iteration - 1, + feval = feval + ) + } xgb.attr(bst$handle, 'niter') <- iteration - 1 From d6385355815005625115b648f6b7dc861eacd47e Mon Sep 17 00:00:00 2001 From: ShaneConneely Date: Thu, 10 Aug 2023 21:02:04 +0100 Subject: [PATCH 082/136] Update README.md (#9462) --- demo/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo/README.md b/demo/README.md index df53b05bb..2be1141dd 100644 --- a/demo/README.md +++ b/demo/README.md @@ -106,7 +106,7 @@ Please send pull requests if you find ones that are missing here. - Prarthana Bhat, 2nd place winner in [DYD Competition](https://datahack.analyticsvidhya.com/contest/date-your-data/). Link to [Solution](https://github.com/analyticsvidhya/DateYourData/blob/master/Prathna_Bhat_Model.R). 
## Talks -- [XGBoost: A Scalable Tree Boosting System](http://datascience.la/xgboost-workshop-and-meetup-talk-with-tianqi-chen/) (video+slides) by Tianqi Chen at the Los Angeles Data Science meetup +- XGBoost: A Scalable Tree Boosting System ([video](https://www.youtube.com/watch?v=Vly8xGnNiWs) + [slides](https://speakerdeck.com/datasciencela/tianqi-chen-xgboost-overview-and-latest-news-la-meetup-talk)) by Tianqi Chen at the Los Angeles Data Science meetup ## Tutorials From 592989017489ec64d5538c53fe5ff19da539b151 Mon Sep 17 00:00:00 2001 From: amdsc21 Date: Thu, 10 Aug 2023 20:02:16 +0000 Subject: [PATCH 083/136] [CI] Update RAPIDS to latest stable --- tests/buildkite/conftest.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/buildkite/conftest.sh b/tests/buildkite/conftest.sh index 0036a06fe..9e821f0fe 100755 --- a/tests/buildkite/conftest.sh +++ b/tests/buildkite/conftest.sh @@ -24,7 +24,7 @@ set -x CUDA_VERSION=11.8.0 NCCL_VERSION=2.16.5-1 -RAPIDS_VERSION=23.06 +RAPIDS_VERSION=23.08 SPARK_VERSION=3.4.0 JDK_VERSION=8 From 428f6cbbe20ccf65d6bdada13bcb0187cc67da04 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 11 Aug 2023 02:07:18 -0500 Subject: [PATCH 084/136] [R] remove default values in internal booster manipulation functions (#9461) --- R-package/R/callbacks.R | 4 ++-- R-package/R/xgb.Booster.R | 14 +++++++++----- R-package/R/xgb.cv.R | 7 ++++++- R-package/R/xgb.load.R | 11 ++++++++--- R-package/R/xgb.train.R | 9 +++++++-- 5 files changed, 32 insertions(+), 13 deletions(-) diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R index d2ee59476..7265967b2 100644 --- a/R-package/R/callbacks.R +++ b/R-package/R/callbacks.R @@ -511,7 +511,7 @@ cb.cv.predict <- function(save_models = FALSE) { if (save_models) { env$basket$models <- lapply(env$bst_folds, function(fd) { xgb.attr(fd$bst, 'niter') <- env$end_iteration - 1 - xgb.Booster.complete(xgb.handleToBooster(fd$bst), saveraw = TRUE) + xgb.Booster.complete(xgb.handleToBooster(handle = fd$bst, raw = NULL), saveraw = TRUE) }) } } @@ -659,7 +659,7 @@ cb.gblinear.history <- function(sparse = FALSE) { } else { # xgb.cv: cf <- vector("list", length(env$bst_folds)) for (i in seq_along(env$bst_folds)) { - dmp <- xgb.dump(xgb.handleToBooster(env$bst_folds[[i]]$bst)) + dmp <- xgb.dump(xgb.handleToBooster(handle = env$bst_folds[[i]]$bst, raw = NULL)) cf[[i]] <- as.numeric(grep('(booster|bias|weigh)', dmp, invert = TRUE, value = TRUE)) if (sparse) cf[[i]] <- as(cf[[i]], "sparseVector") } diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index 6a53577e9..5ffbbc31c 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -1,7 +1,6 @@ # Construct an internal xgboost Booster and return a handle to it.
# internal utility function -xgb.Booster.handle <- function(params = list(), cachelist = list(), - modelfile = NULL, handle = NULL) { +xgb.Booster.handle <- function(params, cachelist, modelfile, handle) { if (typeof(cachelist) != "list" || !all(vapply(cachelist, inherits, logical(1), what = 'xgb.DMatrix'))) { stop("cachelist must be a list of xgb.DMatrix objects") @@ -44,7 +43,7 @@ xgb.Booster.handle <- function(params = list(), cachelist = list(), # Convert xgb.Booster.handle to xgb.Booster # internal utility function -xgb.handleToBooster <- function(handle, raw = NULL) { +xgb.handleToBooster <- function(handle, raw) { bst <- list(handle = handle, raw = raw) class(bst) <- "xgb.Booster" return(bst) @@ -129,7 +128,12 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) { stop("argument type must be xgb.Booster") if (is.null.handle(object$handle)) { - object$handle <- xgb.Booster.handle(modelfile = object$raw, handle = object$handle) + object$handle <- xgb.Booster.handle( + params = list(), + cachelist = list(), + modelfile = object$raw, + handle = object$handle + ) } else { if (is.null(object$raw) && saveraw) { object$raw <- xgb.serialize(object$handle) @@ -475,7 +479,7 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA #' @export predict.xgb.Booster.handle <- function(object, ...) { - bst <- xgb.handleToBooster(object) + bst <- xgb.handleToBooster(handle = object, raw = NULL) ret <- predict(bst, ...) return(ret) diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 788638921..24c1b3f3c 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -202,7 +202,12 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing dtrain <- slice(dall, unlist(folds[-k])) else dtrain <- slice(dall, train_folds[[k]]) - handle <- xgb.Booster.handle(params, list(dtrain, dtest)) + handle <- xgb.Booster.handle( + params = params, + cachelist = list(dtrain, dtest), + modelfile = NULL, + handle = NULL + ) list(dtrain = dtrain, bst = handle, watchlist = list(train = dtrain, test = dtest), index = folds[[k]]) }) rm(dall) diff --git a/R-package/R/xgb.load.R b/R-package/R/xgb.load.R index d98041908..cfbf0b2d8 100644 --- a/R-package/R/xgb.load.R +++ b/R-package/R/xgb.load.R @@ -35,7 +35,12 @@ xgb.load <- function(modelfile) { if (is.null(modelfile)) stop("xgb.load: modelfile cannot be NULL") - handle <- xgb.Booster.handle(modelfile = modelfile) + handle <- xgb.Booster.handle( + params = list(), + cachelist = list(), + modelfile = modelfile, + handle = NULL + ) # re-use modelfile if it is raw so we do not need to serialize if (typeof(modelfile) == "raw") { warning( @@ -45,9 +50,9 @@ xgb.load <- function(modelfile) { " `xgb.unserialize` instead. 
" ) ) - bst <- xgb.handleToBooster(handle, modelfile) + bst <- xgb.handleToBooster(handle = handle, raw = modelfile) } else { - bst <- xgb.handleToBooster(handle, NULL) + bst <- xgb.handleToBooster(handle = handle, raw = NULL) } bst <- xgb.Booster.complete(bst, saveraw = TRUE) return(bst) diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 729475945..7fe64ab34 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -363,8 +363,13 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), is_update <- NVL(params[['process_type']], '.') == 'update' # Construct a booster (either a new one or load from xgb_model) - handle <- xgb.Booster.handle(params, append(watchlist, dtrain), xgb_model) - bst <- xgb.handleToBooster(handle) + handle <- xgb.Booster.handle( + params = params, + cachelist = append(watchlist, dtrain), + modelfile = xgb_model, + handle = NULL + ) + bst <- xgb.handleToBooster(handle = handle, raw = NULL) # extract parameters that can affect the relationship b/w #trees and #iterations num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1) From bdc1a3c1780da9989e90caac527a29678e716ac8 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 11 Aug 2023 19:07:50 +0800 Subject: [PATCH 085/136] Fix pyspark parameter. (#9460) - Don't pass the `use_gpu` parameter to the learner. - Fix GPU approx with PySpark. --- python-package/xgboost/spark/core.py | 7 +++---- .../test_gpu_with_spark/test_gpu_spark.py | 10 ++++++++-- .../test_with_spark/test_spark_local.py | 8 +++++--- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index 2150e5055..a072e9961 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -115,6 +115,7 @@ _pyspark_specific_params = [ "qid_col", "repartition_random_shuffle", "pred_contrib_col", + "use_gpu", ] _non_booster_params = ["missing", "n_estimators", "feature_types", "feature_weights"] @@ -349,11 +350,9 @@ class _SparkXGBParams( ) tree_method = self.getOrDefault(self.getParam("tree_method")) - if ( - self.getOrDefault(self.use_gpu) or use_cuda(self.getOrDefault(self.device)) - ) and not _can_use_qdm(tree_method): + if tree_method == "exact": raise ValueError( - f"The `{tree_method}` tree method is not supported on GPU." + "The `exact` tree method is not supported for distributed systems." 
) if self.getOrDefault(self.features_cols): diff --git a/tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py b/tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py index a962f778e..a954d9d6c 100644 --- a/tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py +++ b/tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py @@ -151,12 +151,18 @@ def spark_diabetes_dataset_feature_cols(spark_session_with_gpu): return train_df, test_df, data.feature_names -def test_sparkxgb_classifier_with_gpu(spark_iris_dataset): +@pytest.mark.parametrize("tree_method", ["hist", "approx"]) +def test_sparkxgb_classifier_with_gpu(tree_method: str, spark_iris_dataset) -> None: from pyspark.ml.evaluation import MulticlassClassificationEvaluator - classifier = SparkXGBClassifier(device="cuda", num_workers=num_workers) + classifier = SparkXGBClassifier( + device="cuda", num_workers=num_workers, tree_method=tree_method + ) train_df, test_df = spark_iris_dataset model = classifier.fit(train_df) + config = json.loads(model.get_booster().save_config()) + ctx = config["learner"]["generic_param"] + assert ctx["device"] == "cuda:0" pred_result_df = model.transform(test_df) evaluator = MulticlassClassificationEvaluator(metricName="f1") f1 = evaluator.evaluate(pred_result_df) diff --git a/tests/test_distributed/test_with_spark/test_spark_local.py b/tests/test_distributed/test_with_spark/test_spark_local.py index 50eafb0a1..e323a3606 100644 --- a/tests/test_distributed/test_with_spark/test_spark_local.py +++ b/tests/test_distributed/test_with_spark/test_spark_local.py @@ -456,7 +456,9 @@ def check_sub_dict_match( assert sub_dist[k] == whole_dict[k], f"check on {k} failed" -def get_params_map(params_kv: dict, estimator: Type) -> dict: +def get_params_map( + params_kv: dict, estimator: xgb.spark.core._SparkXGBEstimator +) -> dict: return {getattr(estimator, k): v for k, v in params_kv.items()} @@ -870,10 +872,10 @@ class TestPySparkLocal: def test_device_param(self, reg_data: RegData, clf_data: ClfData) -> None: clf = SparkXGBClassifier(device="cuda", tree_method="exact") - with pytest.raises(ValueError, match="not supported on GPU"): + with pytest.raises(ValueError, match="not supported for distributed"): clf.fit(clf_data.cls_df_train) regressor = SparkXGBRegressor(device="cuda", tree_method="exact") - with pytest.raises(ValueError, match="not supported on GPU"): + with pytest.raises(ValueError, match="not supported for distributed"): regressor.fit(reg_data.reg_df_train) reg = SparkXGBRegressor(device="cuda", tree_method="gpu_hist") From bb5618339615338b3bb29aa5c208688f993a21ff Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 11 Aug 2023 21:26:46 +0800 Subject: [PATCH 086/136] Normalize file system path. 
(#9463) --- R-package/src/Makevars.in | 1 + R-package/src/Makevars.win | 1 + R-package/tests/testthat/test_dmatrix.R | 1 + src/common/io.cc | 5 +- src/data/data.cc | 120 +++++++++++++----------- src/data/file_iterator.cc | 51 ++++++++++ src/data/file_iterator.h | 56 +++-------- 7 files changed, 137 insertions(+), 98 deletions(-) create mode 100644 src/data/file_iterator.cc diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in index a93f773f9..9e7cbfed4 100644 --- a/R-package/src/Makevars.in +++ b/R-package/src/Makevars.in @@ -47,6 +47,7 @@ OBJECTS= \ $(PKGROOT)/src/data/data.o \ $(PKGROOT)/src/data/sparse_page_raw_format.o \ $(PKGROOT)/src/data/ellpack_page.o \ + $(PKGROOT)/src/data/file_iterator.o \ $(PKGROOT)/src/data/gradient_index.o \ $(PKGROOT)/src/data/gradient_index_page_source.o \ $(PKGROOT)/src/data/gradient_index_format.o \ diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win index d2f47b2aa..7dfa415a4 100644 --- a/R-package/src/Makevars.win +++ b/R-package/src/Makevars.win @@ -47,6 +47,7 @@ OBJECTS= \ $(PKGROOT)/src/data/data.o \ $(PKGROOT)/src/data/sparse_page_raw_format.o \ $(PKGROOT)/src/data/ellpack_page.o \ + $(PKGROOT)/src/data/file_iterator.o \ $(PKGROOT)/src/data/gradient_index.o \ $(PKGROOT)/src/data/gradient_index_page_source.o \ $(PKGROOT)/src/data/gradient_index_format.o \ diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R index 21d39f255..57cc82c17 100644 --- a/R-package/tests/testthat/test_dmatrix.R +++ b/R-package/tests/testthat/test_dmatrix.R @@ -72,6 +72,7 @@ test_that("xgb.DMatrix: saving, loading", { tmp <- c("0 1:1 2:1", "1 3:1", "0 1:1") tmp_file <- tempfile(fileext = ".libsvm") writeLines(tmp, tmp_file) + expect_true(file.exists(tmp_file)) dtest4 <- xgb.DMatrix(paste(tmp_file, "?format=libsvm", sep = ""), silent = TRUE) expect_equal(dim(dtest4), c(3, 4)) expect_equal(getinfo(dtest4, 'label'), c(0, 1, 0)) diff --git a/src/common/io.cc b/src/common/io.cc index 1e15c4173..8dbeba935 100644 --- a/src/common/io.cc +++ b/src/common/io.cc @@ -28,7 +28,7 @@ #include // for size_t #include // for int32_t, uint32_t #include // for memcpy -#include // for filesystem +#include // for filesystem, weakly_canonical #include // for ifstream #include // for distance #include // for numeric_limits @@ -154,7 +154,8 @@ std::string LoadSequentialFile(std::string uri, bool stream) { // Open in binary mode so that correct file size can be computed with // seekg(). This accommodates Windows platform: // https://docs.microsoft.com/en-us/cpp/standard-library/basic-istream-class?view=vs-2019#seekg - std::ifstream ifs(std::filesystem::u8path(uri), std::ios_base::binary | std::ios_base::in); + auto path = std::filesystem::weakly_canonical(std::filesystem::u8path(uri)); + std::ifstream ifs(path, std::ios_base::binary | std::ios_base::in); if (!ifs) { // https://stackoverflow.com/a/17338934 OpenErr(); diff --git a/src/data/data.cc b/src/data/data.cc index 7c76c6d25..e8ecccb81 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -4,42 +4,57 @@ */ #include "xgboost/data.h" -#include +#include // for DMLC_REGISTRY_ENABLE, DMLC_REGISTRY_LINK_TAG -#include -#include -#include +#include // for copy, max, none_of, min +#include // for atomic +#include // for abs +#include // for uint64_t, int32_t, uint8_t, uint32_t +#include // for size_t, strcmp, memcpy +#include // for exception +#include // for operator<<, basic_ostream, basic_ostream::op... 
+#include // for map, operator!= +#include // for accumulate, partial_sum +#include // for get, apply +#include // for remove_pointer_t, remove_reference -#include "../collective/communicator-inl.h" -#include "../collective/communicator.h" -#include "../common/algorithm.h" // for StableSort -#include "../common/api_entry.h" // for XGBAPIThreadLocalEntry -#include "../common/common.h" -#include "../common/error_msg.h" // for InfInData, GroupWeight, GroupSize -#include "../common/group_data.h" -#include "../common/io.h" -#include "../common/linalg_op.h" -#include "../common/math.h" -#include "../common/numeric.h" // for Iota -#include "../common/threading_utils.h" -#include "../common/version.h" -#include "../data/adapter.h" -#include "../data/iterative_dmatrix.h" -#include "./sparse_page_dmatrix.h" -#include "./sparse_page_source.h" -#include "dmlc/io.h" -#include "file_iterator.h" -#include "simple_dmatrix.h" -#include "sparse_page_writer.h" -#include "validation.h" -#include "xgboost/c_api.h" -#include "xgboost/context.h" -#include "xgboost/host_device_vector.h" -#include "xgboost/learner.h" -#include "xgboost/linalg.h" // Vector -#include "xgboost/logging.h" -#include "xgboost/string_view.h" -#include "xgboost/version_config.h" +#include "../collective/communicator-inl.h" // for GetRank, GetWorldSize, Allreduce, IsFederated +#include "../collective/communicator.h" // for Operation +#include "../common/algorithm.h" // for StableSort +#include "../common/api_entry.h" // for XGBAPIThreadLocalEntry +#include "../common/common.h" // for Split +#include "../common/error_msg.h" // for GroupSize, GroupWeight, InfInData +#include "../common/group_data.h" // for ParallelGroupBuilder +#include "../common/io.h" // for PeekableInStream +#include "../common/linalg_op.h" // for ElementWiseTransformHost +#include "../common/math.h" // for CheckNAN +#include "../common/numeric.h" // for Iota, RunLengthEncode +#include "../common/threading_utils.h" // for ParallelFor +#include "../common/version.h" // for Version +#include "../data/adapter.h" // for COOTuple, FileAdapter, IsValidFunctor +#include "../data/iterative_dmatrix.h" // for IterativeDMatrix +#include "./sparse_page_dmatrix.h" // for SparsePageDMatrix +#include "array_interface.h" // for ArrayInterfaceHandler, ArrayInterface, Dispa... +#include "dmlc/base.h" // for BeginPtr +#include "dmlc/common.h" // for OMPException +#include "dmlc/data.h" // for Parser +#include "dmlc/endian.h" // for ByteSwap, DMLC_IO_NO_ENDIAN_SWAP +#include "dmlc/io.h" // for Stream +#include "dmlc/thread_local.h" // for ThreadLocalStore +#include "ellpack_page.h" // for EllpackPage +#include "file_iterator.h" // for ValidateFileFormat, FileIterator, Next, Reset +#include "gradient_index.h" // for GHistIndexMatrix +#include "simple_dmatrix.h" // for SimpleDMatrix +#include "sparse_page_writer.h" // for SparsePageFormatReg +#include "validation.h" // for LabelsCheck, WeightsCheck, ValidateQueryGroup +#include "xgboost/base.h" // for bst_group_t, bst_row_t, bst_float, bst_ulong +#include "xgboost/context.h" // for Context +#include "xgboost/host_device_vector.h" // for HostDeviceVector +#include "xgboost/learner.h" // for HostDeviceVector +#include "xgboost/linalg.h" // for Tensor, Stack, TensorView, Vector, ArrayInte... 
+#include "xgboost/logging.h" // for Error, LogCheck_EQ, CHECK, CHECK_EQ, LOG +#include "xgboost/span.h" // for Span, operator!=, SpanIterator +#include "xgboost/string_view.h" // for operator==, operator<<, StringView namespace dmlc { DMLC_REGISTRY_ENABLE(::xgboost::data::SparsePageFormatReg<::xgboost::SparsePage>); @@ -811,10 +826,10 @@ DMatrix::~DMatrix() { } } -DMatrix *TryLoadBinary(std::string fname, bool silent) { - int magic; - std::unique_ptr fi( - dmlc::Stream::Create(fname.c_str(), "r", true)); +namespace { +DMatrix* TryLoadBinary(std::string fname, bool silent) { + std::int32_t magic; + std::unique_ptr fi(dmlc::Stream::Create(fname.c_str(), "r", true)); if (fi != nullptr) { common::PeekableInStream is(fi.get()); if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic)) { @@ -822,11 +837,10 @@ DMatrix *TryLoadBinary(std::string fname, bool silent) { dmlc::ByteSwap(&magic, sizeof(magic), 1); } if (magic == data::SimpleDMatrix::kMagic) { - DMatrix *dmat = new data::SimpleDMatrix(&is); + DMatrix* dmat = new data::SimpleDMatrix(&is); if (!silent) { - LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ - << " matrix with " << dmat->Info().num_nonzero_ - << " entries loaded from " << fname; + LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with " + << dmat->Info().num_nonzero_ << " entries loaded from " << fname; } return dmat; } @@ -834,6 +848,7 @@ DMatrix *TryLoadBinary(std::string fname, bool silent) { } return nullptr; } +} // namespace DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_split_mode) { auto need_split = false; @@ -845,7 +860,7 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s } std::string fname, cache_file; - size_t dlm_pos = uri.find('#'); + auto dlm_pos = uri.find('#'); if (dlm_pos != std::string::npos) { cache_file = uri.substr(dlm_pos + 1, uri.length()); fname = uri.substr(0, dlm_pos); @@ -857,14 +872,11 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s for (size_t i = 0; i < cache_shards.size(); ++i) { size_t pos = cache_shards[i].rfind('.'); if (pos == std::string::npos) { - os << cache_shards[i] - << ".r" << collective::GetRank() - << "-" << collective::GetWorldSize(); + os << cache_shards[i] << ".r" << collective::GetRank() << "-" + << collective::GetWorldSize(); } else { - os << cache_shards[i].substr(0, pos) - << ".r" << collective::GetRank() - << "-" << collective::GetWorldSize() - << cache_shards[i].substr(pos, cache_shards[i].length()); + os << cache_shards[i].substr(0, pos) << ".r" << collective::GetRank() << "-" + << collective::GetWorldSize() << cache_shards[i].substr(pos, cache_shards[i].length()); } if (i + 1 != cache_shards.size()) { os << ':'; @@ -895,12 +907,12 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s LOG(CONSOLE) << "Load part of data " << partid << " of " << npart << " parts"; } - data::ValidateFileFormat(fname); - DMatrix* dmat {nullptr}; + DMatrix* dmat{nullptr}; if (cache_file.empty()) { - std::unique_ptr> parser( - dmlc::Parser::Create(fname.c_str(), partid, npart, "auto")); + fname = data::ValidateFileFormat(fname); + std::unique_ptr> parser( + dmlc::Parser::Create(fname.c_str(), partid, npart, "auto")); data::FileAdapter adapter(parser.get()); dmat = DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), Context{}.Threads(), cache_file, data_split_mode); diff --git a/src/data/file_iterator.cc b/src/data/file_iterator.cc new file mode 100644 
index 000000000..cebfbdc19
--- /dev/null
+++ b/src/data/file_iterator.cc
@@ -0,0 +1,51 @@
+/**
+ * Copyright 2021-2023, XGBoost contributors
+ */
+#include "file_iterator.h"
+
+#include <dmlc/logging.h>  // for LogCheck_EQ, LogCheck_LE, CHECK_EQ, CHECK_LE, LOG, LOG_...
+
+#include <filesystem>  // for weakly_canonical, path, u8path
+#include <map>         // for map, operator==
+#include <sstream>     // for operator<<, basic_ostream, istringstream
+#include <vector>      // for vector
+
+#include "../common/common.h"     // for Split
+#include "xgboost/string_view.h"  // for operator<<, StringView
+
+namespace xgboost::data {
+std::string ValidateFileFormat(std::string const& uri) {
+  std::vector<std::string> name_args_cache = common::Split(uri, '#');
+  CHECK_LE(name_args_cache.size(), 2)
+      << "Only one `#` is allowed in file path for cachefile specification";
+
+  std::vector<std::string> name_args = common::Split(name_args_cache[0], '?');
+  StringView msg{"URI parameter `format` is required for loading text data: filename?format=csv"};
+  CHECK_EQ(name_args.size(), 2) << msg;
+
+  std::map<std::string, std::string> args;
+  std::vector<std::string> arg_list = common::Split(name_args[1], '&');
+  for (size_t i = 0; i < arg_list.size(); ++i) {
+    std::istringstream is(arg_list[i]);
+    std::pair<std::string, std::string> kv;
+    CHECK(std::getline(is, kv.first, '=')) << "Invalid uri argument format"
+                                           << " for key in arg " << i + 1;
+    CHECK(std::getline(is, kv.second)) << "Invalid uri argument format"
+                                       << " for value in arg " << i + 1;
+    args.insert(kv);
+  }
+  if (args.find("format") == args.cend()) {
+    LOG(FATAL) << msg;
+  }
+
+  auto path = common::Split(uri, '?')[0];
+
+  namespace fs = std::filesystem;
+  name_args[0] = fs::weakly_canonical(fs::u8path(path)).string();
+  if (name_args_cache.size() == 1) {
+    return name_args[0] + "?" + name_args[1];
+  } else {
+    return name_args[0] + "?" + name_args[1] + '#' + name_args_cache[1];
+  }
+}
+}  // namespace xgboost::data
diff --git a/src/data/file_iterator.h b/src/data/file_iterator.h
index 4d7239677..c7f23b478 100644
--- a/src/data/file_iterator.h
+++ b/src/data/file_iterator.h
@@ -4,46 +4,20 @@
 #ifndef XGBOOST_DATA_FILE_ITERATOR_H_
 #define XGBOOST_DATA_FILE_ITERATOR_H_
 
-#include
-#include
-#include
-#include
-#include
+#include <algorithm>  // for max_element
+#include <cstddef>    // for size_t
+#include <cstdint>    // for uint32_t
+#include <memory>     // for unique_ptr
+#include <string>     // for string
+#include <utility>    // for move
 
-#include "array_interface.h"
-#include "dmlc/data.h"
-#include "xgboost/c_api.h"
-#include "xgboost/json.h"
-#include "xgboost/linalg.h"
+#include "dmlc/data.h"        // for RowBlock, Parser
+#include "xgboost/c_api.h"    // for XGDMatrixSetDenseInfo, XGDMatrixFree, XGProxyDMatrixCreate
+#include "xgboost/linalg.h"   // for ArrayInterfaceStr, MakeVec
+#include "xgboost/logging.h"  // for CHECK
 
-namespace xgboost {
-namespace data {
-inline void ValidateFileFormat(std::string const& uri) {
-  std::vector<std::string> name_cache = common::Split(uri, '#');
-  CHECK_LE(name_cache.size(), 2)
-      << "Only one `#` is allowed in file path for cachefile specification";
-
-  std::vector<std::string> name_args = common::Split(name_cache[0], '?');
-  CHECK_LE(name_args.size(), 2) << "only one `?` is allowed in file path.";
-
-  StringView msg{"URI parameter `format` is required for loading text data: filename?format=csv"};
-  CHECK_EQ(name_args.size(), 2) << msg;
-
-  std::map<std::string, std::string> args;
-  std::vector<std::string> arg_list = common::Split(name_args[1], '&');
-  for (size_t i = 0; i < arg_list.size(); ++i) {
-    std::istringstream is(arg_list[i]);
-    std::pair<std::string, std::string> kv;
-    CHECK(std::getline(is, kv.first, '=')) << "Invalid uri argument format"
-                                           << " for key in arg " << i + 1;
-    CHECK(std::getline(is, kv.second)) << "Invalid uri argument
format" - << " for value in arg " << i + 1; - args.insert(kv); - } - if (args.find("format") == args.cend()) { - LOG(FATAL) << msg; - } -} +namespace xgboost::data { +[[nodiscard]] std::string ValidateFileFormat(std::string const& uri); /** * An iterator for implementing external memory support with file inputs. Users of @@ -72,8 +46,7 @@ class FileIterator { public: FileIterator(std::string uri, unsigned part_index, unsigned num_parts) - : uri_{std::move(uri)}, part_idx_{part_index}, n_parts_{num_parts} { - ValidateFileFormat(uri_); + : uri_{ValidateFileFormat(std::move(uri))}, part_idx_{part_index}, n_parts_{num_parts} { XGProxyDMatrixCreate(&proxy_); } ~FileIterator() { @@ -132,6 +105,5 @@ inline int Next(DataIterHandle self) { return static_cast(self)->Next(); } } // namespace fileiter -} // namespace data -} // namespace xgboost +} // namespace xgboost::data #endif // XGBOOST_DATA_FILE_ITERATOR_H_ From 801116c30707d818e084cf95201db5f0ec361b17 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sun, 13 Aug 2023 23:41:49 +0800 Subject: [PATCH 087/136] Test scikit-learn model IO with gblinear. (#9459) --- tests/python/test_with_sklearn.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index 9a58b7277..69f144caf 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -792,19 +792,19 @@ def test_kwargs_grid_search(): from sklearn import datasets from sklearn.model_selection import GridSearchCV - params = {'tree_method': 'hist'} - clf = xgb.XGBClassifier(n_estimators=1, learning_rate=1.0, **params) - assert clf.get_params()['tree_method'] == 'hist' - # 'max_leaves' is not a default argument of XGBClassifier + params = {"tree_method": "hist"} + clf = xgb.XGBClassifier(n_estimators=3, **params) + assert clf.get_params()["tree_method"] == "hist" + # 'eta' is not a default argument of XGBClassifier # Check we can still do grid search over this parameter - search_params = {'max_leaves': range(2, 5)} + search_params = {"eta": [0, 0.2, 0.4]} grid_cv = GridSearchCV(clf, search_params, cv=5) iris = datasets.load_iris() grid_cv.fit(iris.data, iris.target) # Expect unique results for each parameter value # This confirms sklearn is able to successfully update the parameter - means = grid_cv.cv_results_['mean_test_score'] + means = grid_cv.cv_results_["mean_test_score"] assert len(means) == len(set(means)) @@ -928,6 +928,25 @@ def save_load_model(model_path): xgb_model = xgb.XGBModel() xgb_model.load_model(model_path) + clf = xgb.XGBClassifier(booster="gblinear", early_stopping_rounds=1) + clf.fit(X, y, eval_set=[(X, y)]) + best_iteration = clf.best_iteration + best_score = clf.best_score + predt_0 = clf.predict(X) + clf.save_model(model_path) + clf.load_model(model_path) + predt_1 = clf.predict(X) + np.testing.assert_allclose(predt_0, predt_1) + assert clf.best_iteration == best_iteration + assert clf.best_score == best_score + + clfpkl = pickle.dumps(clf) + clf = pickle.loads(clfpkl) + predt_2 = clf.predict(X) + np.testing.assert_allclose(predt_0, predt_2) + assert clf.best_iteration == best_iteration + assert clf.best_score == best_score + def test_save_load_model(): with tempfile.TemporaryDirectory() as tempdir: From fd4335d0bfb5795abeedf6f166bf3292d74fe5ed Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sun, 13 Aug 2023 23:42:27 +0800 Subject: [PATCH 088/136] [doc] Document the current status of some features. 
(#9469) --- demo/guide-python/quantile_regression.py | 5 +++++ doc/tutorials/categorical.rst | 19 ++++++++++--------- doc/tutorials/multioutput.rst | 7 ++++++- 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/demo/guide-python/quantile_regression.py b/demo/guide-python/quantile_regression.py index 6d3e08df5..4f69a8c80 100644 --- a/demo/guide-python/quantile_regression.py +++ b/demo/guide-python/quantile_regression.py @@ -7,6 +7,11 @@ Quantile Regression The script is inspired by this awesome example in sklearn: https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_quantile.html +.. note:: + + The feature is only supported using the Python package. In addition, quantile + crossing can happen due to limitation in the algorithm. + """ import argparse from typing import Dict diff --git a/doc/tutorials/categorical.rst b/doc/tutorials/categorical.rst index 2a84080cf..fb612bca1 100644 --- a/doc/tutorials/categorical.rst +++ b/doc/tutorials/categorical.rst @@ -4,16 +4,17 @@ Categorical Data .. note:: - As of XGBoost 1.6, the feature is experimental and has limited features + As of XGBoost 1.6, the feature is experimental and has limited features. Only the + Python package is fully supported. -Starting from version 1.5, XGBoost has experimental support for categorical data available -for public testing. For numerical data, the split condition is defined as :math:`value < -threshold`, while for categorical data the split is defined depending on whether -partitioning or onehot encoding is used. For partition-based splits, the splits are -specified as :math:`value \in categories`, where ``categories`` is the set of categories -in one feature. If onehot encoding is used instead, then the split is defined as -:math:`value == category`. More advanced categorical split strategy is planned for future -releases and this tutorial details how to inform XGBoost about the data type. +Starting from version 1.5, the XGBoost Python package has experimental support for +categorical data available for public testing. For numerical data, the split condition is +defined as :math:`value < threshold`, while for categorical data the split is defined +depending on whether partitioning or onehot encoding is used. For partition-based splits, +the splits are specified as :math:`value \in categories`, where ``categories`` is the set +of categories in one feature. If onehot encoding is used instead, then the split is +defined as :math:`value == category`. More advanced categorical split strategy is planned +for future releases and this tutorial details how to inform XGBoost about the data type. ************************************ Training with scikit-learn Interface diff --git a/doc/tutorials/multioutput.rst b/doc/tutorials/multioutput.rst index 983002aed..73e89fe03 100644 --- a/doc/tutorials/multioutput.rst +++ b/doc/tutorials/multioutput.rst @@ -11,6 +11,11 @@ can be simultaneously classified as both sci-fi and comedy. For detailed explan terminologies related to different multi-output models please refer to the :doc:`scikit-learn user guide `. +.. note:: + + As of XGBoost 2.0, the feature is experimental and has limited features. Only the + Python package is tested. + ********************************** Training with One-Model-Per-Target ********************************** @@ -49,7 +54,7 @@ Training with Vector Leaf .. note:: - This is still working-in-progress, and many features are missing. + This is still working-in-progress, and most features are missing. 
XGBoost can optionally build multi-output trees with the size of leaf equals to the number of targets when the tree method `hist` is used. The behavior can be controlled by the From f03463c45b9256d27009bba192a558e4d7b188c2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sun, 13 Aug 2023 18:54:37 -0700 Subject: [PATCH 089/136] [CI] Update RAPIDS to latest stable (#9464) * [CI] Update RAPIDS to latest stable * [CI] Use CMake 3.26.4 --------- Co-authored-by: hcho3 Co-authored-by: Hyunsu Philip Cho --- tests/buildkite/conftest.sh | 2 +- tests/ci_build/Dockerfile.gpu_build_centos7 | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/buildkite/conftest.sh b/tests/buildkite/conftest.sh index 0036a06fe..9e821f0fe 100755 --- a/tests/buildkite/conftest.sh +++ b/tests/buildkite/conftest.sh @@ -24,7 +24,7 @@ set -x CUDA_VERSION=11.8.0 NCCL_VERSION=2.16.5-1 -RAPIDS_VERSION=23.06 +RAPIDS_VERSION=23.08 SPARK_VERSION=3.4.0 JDK_VERSION=8 diff --git a/tests/ci_build/Dockerfile.gpu_build_centos7 b/tests/ci_build/Dockerfile.gpu_build_centos7 index 4f9823baa..6134d49aa 100644 --- a/tests/ci_build/Dockerfile.gpu_build_centos7 +++ b/tests/ci_build/Dockerfile.gpu_build_centos7 @@ -17,8 +17,8 @@ RUN \ bash conda.sh -b -p /opt/mambaforge && \ /opt/mambaforge/bin/python -m pip install awscli && \ # CMake - wget -nv -nc https://cmake.org/files/v3.24/cmake-3.24.0-linux-x86_64.sh --no-check-certificate && \ - bash cmake-3.24.0-linux-x86_64.sh --skip-license --prefix=/usr + wget -nv -nc https://cmake.org/files/v3.26/cmake-3.26.4-linux-x86_64.sh --no-check-certificate && \ + bash cmake-3.26.4-linux-x86_64.sh --skip-license --prefix=/usr # NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html) RUN \ From 05d70000968acaf7c16e72eaaa32cb3f86fa209d Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 14 Aug 2023 15:49:00 +0800 Subject: [PATCH 090/136] Handle special characters in JSON model dump. (#9474) --- src/common/common.cc | 63 ++++++++++++++++------ src/common/common.h | 41 ++++++-------- src/common/json.cc | 90 ++++++++++++------------------- src/common/numeric.h | 1 + src/learner.cc | 4 +- src/tree/tree_model.cc | 11 ++-- tests/python/test_basic_models.py | 20 +++++++ 7 files changed, 127 insertions(+), 103 deletions(-) diff --git a/src/common/common.cc b/src/common/common.cc index 8f4f4b5c8..086f4c00d 100644 --- a/src/common/common.cc +++ b/src/common/common.cc @@ -1,16 +1,17 @@ -/*! - * Copyright 2015-2019 by Contributors - * \file common.cc - * \brief Enable all kinds of global variables in common. +/** + * Copyright 2015-2023 by Contributors */ -#include -#include - #include "common.h" -#include "./random.h" -namespace xgboost { -namespace common { +#include // for ThreadLocalStore + +#include // for uint8_t +#include // for snprintf, size_t +#include // for string + +#include "./random.h" // for GlobalRandomEngine, GlobalRandom + +namespace xgboost::common { /*! \brief thread local entry for random. */ struct RandomThreadLocalEntry { /*! \brief the random engine instance. 
*/ @@ -19,15 +20,43 @@ struct RandomThreadLocalEntry { using RandomThreadLocalStore = dmlc::ThreadLocalStore; -GlobalRandomEngine& GlobalRandom() { - return RandomThreadLocalStore::Get()->engine; +GlobalRandomEngine &GlobalRandom() { return RandomThreadLocalStore::Get()->engine; } + +void EscapeU8(std::string const &string, std::string *p_buffer) { + auto &buffer = *p_buffer; + for (size_t i = 0; i < string.length(); i++) { + const auto ch = string[i]; + if (ch == '\\') { + if (i < string.size() && string[i + 1] == 'u') { + buffer += "\\"; + } else { + buffer += "\\\\"; + } + } else if (ch == '"') { + buffer += "\\\""; + } else if (ch == '\b') { + buffer += "\\b"; + } else if (ch == '\f') { + buffer += "\\f"; + } else if (ch == '\n') { + buffer += "\\n"; + } else if (ch == '\r') { + buffer += "\\r"; + } else if (ch == '\t') { + buffer += "\\t"; + } else if (static_cast(ch) <= 0x1f) { + // Unit separator + char buf[8]; + snprintf(buf, sizeof buf, "\\u%04x", ch); + buffer += buf; + } else { + buffer += ch; + } + } } #if !defined(XGBOOST_USE_CUDA) -int AllVisibleGPUs() { - return 0; -} +int AllVisibleGPUs() { return 0; } #endif // !defined(XGBOOST_USE_CUDA) -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/src/common/common.h b/src/common/common.h index 35c807bef..bedff80b3 100644 --- a/src/common/common.h +++ b/src/common/common.h @@ -6,20 +6,19 @@ #ifndef XGBOOST_COMMON_COMMON_H_ #define XGBOOST_COMMON_COMMON_H_ -#include -#include -#include +#include // for max +#include // for array +#include // for ceil +#include // for size_t +#include // for int32_t, int64_t +#include // for basic_istream, operator<<, istringstream +#include // for string, basic_string, getline, char_traits +#include // for make_tuple +#include // for forward, index_sequence, make_index_sequence +#include // for vector -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "xgboost/base.h" // for XGBOOST_DEVICE +#include "xgboost/logging.h" // for LOG, LOG_FATAL, LogMessageFatal #if defined(__CUDACC__) #include @@ -52,8 +51,7 @@ inline cudaError_t ThrowOnCudaError(cudaError_t code, const char *file, #endif // defined(__CUDACC__) } // namespace dh -namespace xgboost { -namespace common { +namespace xgboost::common { /*! * \brief Split a string by delimiter * \param s String to be split. @@ -69,19 +67,13 @@ inline std::vector Split(const std::string& s, char delim) { return ret; } +void EscapeU8(std::string const &string, std::string *p_buffer); + template XGBOOST_DEVICE T Max(T a, T b) { return a < b ? b : a; } -// simple routine to convert any data to string -template -inline std::string ToString(const T& data) { - std::ostringstream os; - os << data; - return os.str(); -} - template XGBOOST_DEVICE T1 DivRoundUp(const T1 a, const T2 b) { return static_cast(std::ceil(static_cast(a) / b)); @@ -195,6 +187,5 @@ template XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) { return indptr[group + 1] - 1; } -} // namespace common -} // namespace xgboost +} // namespace xgboost::common #endif // XGBOOST_COMMON_COMMON_H_ diff --git a/src/common/json.cc b/src/common/json.cc index c3d61b47d..de9a89f78 100644 --- a/src/common/json.cc +++ b/src/common/json.cc @@ -1,23 +1,29 @@ -/*! 
- * Copyright (c) by Contributors 2019-2022 +/** + * Copyright 2019-2023, XGBoost Contributors */ #include "xgboost/json.h" -#include +#include // for array +#include // for isdigit +#include // for isinf, isnan +#include // for EOF +#include // for size_t, strtof +#include // for memcpy +#include // for initializer_list +#include // for distance +#include // for numeric_limits +#include // for allocator +#include // for operator<<, basic_ostream, operator&, ios, stringstream +#include // for errc -#include -#include -#include -#include -#include -#include - -#include "./math.h" -#include "charconv.h" -#include "xgboost/base.h" -#include "xgboost/json_io.h" -#include "xgboost/logging.h" -#include "xgboost/string_view.h" +#include "./math.h" // for CheckNAN +#include "charconv.h" // for to_chars, NumericLimits, from_chars, to_chars_result +#include "common.h" // for EscapeU8 +#include "xgboost/base.h" // for XGBOOST_EXPECT +#include "xgboost/intrusive_ptr.h" // for IntrusivePtr +#include "xgboost/json_io.h" // for JsonReader, UBJReader, UBJWriter, JsonWriter, ToBigEn... +#include "xgboost/logging.h" // for LOG, LOG_FATAL, LogMessageFatal, LogCheck_NE, CHECK +#include "xgboost/string_view.h" // for StringView, operator<< namespace xgboost { @@ -57,12 +63,12 @@ void JsonWriter::Visit(JsonObject const* obj) { } void JsonWriter::Visit(JsonNumber const* num) { - char number[NumericLimits::kToCharsSize]; - auto res = to_chars(number, number + sizeof(number), num->GetNumber()); + std::array::kToCharsSize> number; + auto res = to_chars(number.data(), number.data() + number.size(), num->GetNumber()); auto end = res.ptr; auto ori_size = stream_->size(); - stream_->resize(stream_->size() + end - number); - std::memcpy(stream_->data() + ori_size, number, end - number); + stream_->resize(stream_->size() + end - number.data()); + std::memcpy(stream_->data() + ori_size, number.data(), end - number.data()); } void JsonWriter::Visit(JsonInteger const* num) { @@ -88,43 +94,15 @@ void JsonWriter::Visit(JsonNull const* ) { } void JsonWriter::Visit(JsonString const* str) { - std::string buffer; - buffer += '"'; - auto const& string = str->GetString(); - for (size_t i = 0; i < string.length(); i++) { - const char ch = string[i]; - if (ch == '\\') { - if (i < string.size() && string[i+1] == 'u') { - buffer += "\\"; - } else { - buffer += "\\\\"; - } - } else if (ch == '"') { - buffer += "\\\""; - } else if (ch == '\b') { - buffer += "\\b"; - } else if (ch == '\f') { - buffer += "\\f"; - } else if (ch == '\n') { - buffer += "\\n"; - } else if (ch == '\r') { - buffer += "\\r"; - } else if (ch == '\t') { - buffer += "\\t"; - } else if (static_cast(ch) <= 0x1f) { - // Unit separator - char buf[8]; - snprintf(buf, sizeof buf, "\\u%04x", ch); - buffer += buf; - } else { - buffer += ch; - } - } - buffer += '"'; + std::string buffer; + buffer += '"'; + auto const& string = str->GetString(); + common::EscapeU8(string, &buffer); + buffer += '"'; - auto s = stream_->size(); - stream_->resize(s + buffer.size()); - std::memcpy(stream_->data() + s, buffer.data(), buffer.size()); + auto s = stream_->size(); + stream_->resize(s + buffer.size()); + std::memcpy(stream_->data() + s, buffer.data(), buffer.size()); } void JsonWriter::Visit(JsonBoolean const* boolean) { diff --git a/src/common/numeric.h b/src/common/numeric.h index 2da85502a..5b45bba8c 100644 --- a/src/common/numeric.h +++ b/src/common/numeric.h @@ -10,6 +10,7 @@ #include // for size_t #include // for int32_t #include // for iterator_traits +#include // for accumulate 
#include #include "common.h" // AssertGPUSupport diff --git a/src/learner.cc b/src/learner.cc index b2d6baff0..81d1b795b 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -797,7 +797,7 @@ class LearnerConfiguration : public Learner { bool has_nc {cfg_.find("num_class") != cfg_.cend()}; // Inject num_class into configuration. // FIXME(jiamingy): Remove the duplicated parameter in softmax - cfg_["num_class"] = common::ToString(mparam_.num_class); + cfg_["num_class"] = std::to_string(mparam_.num_class); auto& args = *p_args; args = {cfg_.cbegin(), cfg_.cend()}; // renew obj_->Configure(args); @@ -1076,7 +1076,7 @@ class LearnerIO : public LearnerConfiguration { mparam_.major_version = std::get<0>(Version::Self()); mparam_.minor_version = std::get<1>(Version::Self()); - cfg_["num_feature"] = common::ToString(mparam_.num_feature); + cfg_["num_feature"] = std::to_string(mparam_.num_feature); auto n = tparam_.__DICT__(); cfg_.insert(n.cbegin(), n.cend()); diff --git a/src/tree/tree_model.cc b/src/tree/tree_model.cc index f32ea701f..d37be14b8 100644 --- a/src/tree/tree_model.cc +++ b/src/tree/tree_model.cc @@ -398,11 +398,14 @@ class JsonGenerator : public TreeGenerator { static std::string const kIndicatorTemplate = R"ID( "nodeid": {nid}, "depth": {depth}, "split": "{fname}", "yes": {yes}, "no": {no})ID"; auto split_index = tree[nid].SplitIndex(); + auto fname = fmap_.Name(split_index); + std::string qfname; // quoted + common::EscapeU8(fname, &qfname); auto result = SuperT::Match( kIndicatorTemplate, {{"{nid}", std::to_string(nid)}, {"{depth}", std::to_string(depth)}, - {"{fname}", fmap_.Name(split_index)}, + {"{fname}", qfname}, {"{yes}", std::to_string(nyes)}, {"{no}", std::to_string(tree[nid].DefaultChild())}}); return result; @@ -430,12 +433,14 @@ class JsonGenerator : public TreeGenerator { std::string const &template_str, std::string cond, uint32_t depth) const { auto split_index = tree[nid].SplitIndex(); + auto fname = split_index < fmap_.Size() ? fmap_.Name(split_index) : std::to_string(split_index); + std::string qfname; // quoted + common::EscapeU8(fname, &qfname); std::string const result = SuperT::Match( template_str, {{"{nid}", std::to_string(nid)}, {"{depth}", std::to_string(depth)}, - {"{fname}", split_index < fmap_.Size() ? 
fmap_.Name(split_index) : - std::to_string(split_index)}, + {"{fname}", qfname}, {"{cond}", cond}, {"{left}", std::to_string(tree[nid].LeftChild())}, {"{right}", std::to_string(tree[nid].RightChild())}, diff --git a/tests/python/test_basic_models.py b/tests/python/test_basic_models.py index 610a9236e..f0c80124d 100644 --- a/tests/python/test_basic_models.py +++ b/tests/python/test_basic_models.py @@ -439,6 +439,26 @@ class TestModels: 'objective': 'multi:softmax'} validate_model(parameters) + def test_special_model_dump_characters(self): + params = {"objective": "reg:squarederror", "max_depth": 3} + feature_names = ['"feature 0"', "\tfeature\n1", "feature 2"] + X, y, w = tm.make_regression(n_samples=128, n_features=3, use_cupy=False) + Xy = xgb.DMatrix(X, label=y, feature_names=feature_names) + booster = xgb.train(params, Xy, num_boost_round=3) + json_dump = booster.get_dump(dump_format="json") + assert len(json_dump) == 3 + + def validate(obj: dict) -> None: + for k, v in obj.items(): + if k == "split": + assert v in feature_names + elif isinstance(v, dict): + validate(v) + + for j_tree in json_dump: + loaded = json.loads(j_tree) + validate(loaded) + def test_categorical_model_io(self): X, y = tm.make_categorical(256, 16, 71, False) Xy = xgb.DMatrix(X, y, enable_categorical=True) From 344f90b67ba0966c04ef05321eb99c127c3c2552 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Mon, 14 Aug 2023 17:52:14 +0800 Subject: [PATCH 091/136] [jvm-packages] throw exception when tree_method=approx and device=cuda (#9478) --------- Co-authored-by: Jiaming Yuan --- .../dmlc/xgboost4j/scala/spark/XGBoost.scala | 76 ++++++++++--------- .../spark/params/LearningTaskParams.scala | 2 + .../scala/spark/ParameterSuite.scala | 11 +++ 3 files changed, 54 insertions(+), 35 deletions(-) diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala index 7bb245035..d12431479 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala @@ -93,12 +93,14 @@ private[this] class XGBoostExecutionParamsFactory(rawParams: Map[String, Any], s private val overridedParams = overrideParams(rawParams, sc) + validateSparkSslConf() + /** * Check to see if Spark expects SSL encryption (`spark.ssl.enabled` set to true). * If so, throw an exception unless this safety measure has been explicitly overridden * via conf `xgboost.spark.ignoreSsl`. */ - private def validateSparkSslConf: Unit = { + private def validateSparkSslConf(): Unit = { val (sparkSslEnabled: Boolean, xgboostSparkIgnoreSsl: Boolean) = SparkSession.getActiveSession match { case Some(ss) => @@ -148,55 +150,59 @@ private[this] class XGBoostExecutionParamsFactory(rawParams: Map[String, Any], s overridedParams } + /** + * The Map parameters accepted by estimator's constructor may have string type, + * Eg, Map("num_workers" -> "6", "num_round" -> 5), we need to convert these + * kind of parameters into the correct type in the function. 
+ * + * @return XGBoostExecutionParams + */ def buildXGBRuntimeParams: XGBoostExecutionParams = { - val nWorkers = overridedParams("num_workers").asInstanceOf[Int] - val round = overridedParams("num_round").asInstanceOf[Int] - val useExternalMemory = overridedParams - .getOrElse("use_external_memory", false).asInstanceOf[Boolean] + val obj = overridedParams.getOrElse("custom_obj", null).asInstanceOf[ObjectiveTrait] val eval = overridedParams.getOrElse("custom_eval", null).asInstanceOf[EvalTrait] - val missing = overridedParams.getOrElse("missing", Float.NaN).asInstanceOf[Float] - val allowNonZeroForMissing = overridedParams - .getOrElse("allow_non_zero_for_missing", false) - .asInstanceOf[Boolean] - validateSparkSslConf - var treeMethod: Option[String] = None - if (overridedParams.contains("tree_method")) { - require(overridedParams("tree_method") == "hist" || - overridedParams("tree_method") == "approx" || - overridedParams("tree_method") == "auto" || - overridedParams("tree_method") == "gpu_hist", "xgboost4j-spark only supports tree_method" + - " as 'hist', 'approx', 'gpu_hist', and 'auto'") - treeMethod = Some(overridedParams("tree_method").asInstanceOf[String]) - } - - // back-compatible with "gpu_hist" - val device: Option[String] = if (treeMethod.exists(_ == "gpu_hist")) { - Some("cuda") - } else overridedParams.get("device").map(_.toString) - - if (overridedParams.contains("train_test_ratio")) { - logger.warn("train_test_ratio is deprecated since XGBoost 0.82, we recommend to explicitly" + - " pass a training and multiple evaluation datasets by passing 'eval_sets' and " + - "'eval_set_names'") - } - require(nWorkers > 0, "you must specify more than 0 workers") if (obj != null) { require(overridedParams.get("objective_type").isDefined, "parameter \"objective_type\" " + "is not defined, you have to specify the objective type as classification or regression" + " with a customized objective function") } + + var trainTestRatio = 1.0 + if (overridedParams.contains("train_test_ratio")) { + logger.warn("train_test_ratio is deprecated since XGBoost 0.82, we recommend to explicitly" + + " pass a training and multiple evaluation datasets by passing 'eval_sets' and " + + "'eval_set_names'") + trainTestRatio = overridedParams.get("train_test_ratio").get.asInstanceOf[Double] + } + + val nWorkers = overridedParams("num_workers").asInstanceOf[Int] + val round = overridedParams("num_round").asInstanceOf[Int] + val useExternalMemory = overridedParams + .getOrElse("use_external_memory", false).asInstanceOf[Boolean] + + val missing = overridedParams.getOrElse("missing", Float.NaN).asInstanceOf[Float] + val allowNonZeroForMissing = overridedParams + .getOrElse("allow_non_zero_for_missing", false) + .asInstanceOf[Boolean] + + val treeMethod: Option[String] = overridedParams.get("tree_method").map(_.toString) + // back-compatible with "gpu_hist" + val device: Option[String] = if (treeMethod.exists(_ == "gpu_hist")) { + Some("cuda") + } else overridedParams.get("device").map(_.toString) + + require(!(treeMethod.exists(_ == "approx") && device.exists(_ == "cuda")), + "The tree method \"approx\" is not yet supported for Spark GPU cluster") + val trackerConf = overridedParams.get("tracker_conf") match { case None => TrackerConf() case Some(conf: TrackerConf) => conf case _ => throw new IllegalArgumentException("parameter \"tracker_conf\" must be an " + "instance of TrackerConf.") } - val checkpointParam = - ExternalCheckpointParams.extractParams(overridedParams) - val trainTestRatio = 
overridedParams.getOrElse("train_test_ratio", 1.0) - .asInstanceOf[Double] + val checkpointParam = ExternalCheckpointParams.extractParams(overridedParams) + val seed = overridedParams.getOrElse("seed", System.nanoTime()).asInstanceOf[Long] val inputParams = XGBoostExecutionInputParams(trainTestRatio, seed) diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/LearningTaskParams.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/LearningTaskParams.scala index bcbd7548f..b73e6cbaa 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/LearningTaskParams.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/LearningTaskParams.scala @@ -68,11 +68,13 @@ private[spark] trait LearningTaskParams extends Params { /** * Fraction of training points to use for testing. */ + @Deprecated final val trainTestRatio = new DoubleParam(this, "trainTestRatio", "fraction of training points to use for testing", ParamValidators.inRange(0, 1)) setDefault(trainTestRatio, 1.0) + @Deprecated final def getTrainTestRatio: Double = $(trainTestRatio) /** diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ParameterSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ParameterSuite.scala index 11b60e74d..f187f7394 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ParameterSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ParameterSuite.scala @@ -92,4 +92,15 @@ class ParameterSuite extends AnyFunSuite with PerTest with BeforeAndAfterAll { classifier.getBaseScore } } + + test("approx can't be used for gpu train") { + val paramMap = Map("tree_method" -> "approx", "device" -> "cuda") + val trainingDF = buildDataFrame(MultiClassification.train) + val xgb = new XGBoostClassifier(paramMap) + val thrown = intercept[IllegalArgumentException] { + xgb.fit(trainingDF) + } + assert(thrown.getMessage.contains("The tree method \"approx\" is not yet supported " + + "for Spark GPU cluster")) + } } From 2c84daeca7a7098b9ad3594426be3eac4707a675 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 14 Aug 2023 09:18:02 -0500 Subject: [PATCH 092/136] [R] [doc] remove documentation index entries for internal functions (#9477) --- R-package/R/xgb.ggplot.R | 2 ++ R-package/man/normalize.Rd | 18 --------------- R-package/man/prepare.ggplot.shap.data.Rd | 27 ----------------------- 3 files changed, 2 insertions(+), 45 deletions(-) delete mode 100644 R-package/man/normalize.Rd delete mode 100644 R-package/man/prepare.ggplot.shap.data.Rd diff --git a/R-package/R/xgb.ggplot.R b/R-package/R/xgb.ggplot.R index f96a8a37f..69d26e2c4 100644 --- a/R-package/R/xgb.ggplot.R +++ b/R-package/R/xgb.ggplot.R @@ -142,6 +142,7 @@ xgb.ggplot.shap.summary <- function(data, shap_contrib = NULL, features = NULL, #' #' @return A data.table containing the observation ID, the feature name, the #' feature value (normalized if specified), and the SHAP contribution value. +#' @noRd prepare.ggplot.shap.data <- function(data_list, normalize = FALSE) { data <- data_list[["data"]] shap_contrib <- data_list[["shap_contrib"]] @@ -170,6 +171,7 @@ prepare.ggplot.shap.data <- function(data_list, normalize = FALSE) { #' @param x Numeric vector #' #' @return Numeric vector with mean 0 and sd 1. 
+#' @noRd normalize <- function(x) { loc <- mean(x, na.rm = TRUE) scale <- stats::sd(x, na.rm = TRUE) diff --git a/R-package/man/normalize.Rd b/R-package/man/normalize.Rd deleted file mode 100644 index 6a05e8342..000000000 --- a/R-package/man/normalize.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/xgb.ggplot.R -\name{normalize} -\alias{normalize} -\title{Scale feature value to have mean 0, standard deviation 1} -\usage{ -normalize(x) -} -\arguments{ -\item{x}{Numeric vector} -} -\value{ -Numeric vector with mean 0 and sd 1. -} -\description{ -This is used to compare multiple features on the same plot. -Internal utility function -} diff --git a/R-package/man/prepare.ggplot.shap.data.Rd b/R-package/man/prepare.ggplot.shap.data.Rd deleted file mode 100644 index 57f71a3ff..000000000 --- a/R-package/man/prepare.ggplot.shap.data.Rd +++ /dev/null @@ -1,27 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/xgb.ggplot.R -\name{prepare.ggplot.shap.data} -\alias{prepare.ggplot.shap.data} -\title{Combine and melt feature values and SHAP contributions for sample -observations.} -\usage{ -prepare.ggplot.shap.data(data_list, normalize = FALSE) -} -\arguments{ -\item{data_list}{List containing 'data' and 'shap_contrib' returned by -\code{xgb.shap.data()}.} - -\item{normalize}{Whether to standardize feature values to have mean 0 and -standard deviation 1 (useful for comparing multiple features on the same -plot). Default \code{FALSE}.} -} -\value{ -A data.table containing the observation ID, the feature name, the - feature value (normalized if specified), and the SHAP contribution value. -} -\description{ -Conforms to data format required for ggplot functions. -} -\details{ -Internal utility function. 
-} From e3f624d8e765d15eb0b560d5ad8b1c140c61c314 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 14 Aug 2023 09:18:33 -0500 Subject: [PATCH 093/136] [R] remove more uses of default values in internal functions (#9476) --- R-package/R/xgb.DMatrix.R | 2 +- R-package/R/xgb.cv.R | 8 +++++++- R-package/R/xgb.ggplot.R | 2 +- R-package/R/xgboost.R | 8 +++++++- 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index 20aab5335..fc2609416 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -88,7 +88,7 @@ xgb.DMatrix <- function(data, info = list(), missing = NA, silent = FALSE, nthre # get dmatrix from data, label # internal helper method -xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL, nthread = NULL) { +xgb.get.DMatrix <- function(data, label, missing, weight, nthread) { if (inherits(data, "dgCMatrix") || is.matrix(data)) { if (is.null(label)) { stop("label must be provided when data is a matrix") diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 24c1b3f3c..0b1baaa84 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -194,7 +194,13 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing # create the booster-folds # train_folds - dall <- xgb.get.DMatrix(data, label, missing, nthread = params$nthread) + dall <- xgb.get.DMatrix( + data = data, + label = label, + missing = missing, + weight = NULL, + nthread = params$nthread + ) bst_folds <- lapply(seq_along(folds), function(k) { dtest <- slice(dall, folds[[k]]) # code originally contributed by @RolandASc on stackoverflow diff --git a/R-package/R/xgb.ggplot.R b/R-package/R/xgb.ggplot.R index 69d26e2c4..e79644543 100644 --- a/R-package/R/xgb.ggplot.R +++ b/R-package/R/xgb.ggplot.R @@ -183,7 +183,7 @@ normalize <- function(x) { # ... the plots # cols number of columns # internal utility function -multiplot <- function(..., cols = 1) { +multiplot <- function(..., cols) { plots <- list(...) num_plots <- length(plots) diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index a1a8f9573..db4fd67aa 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -10,7 +10,13 @@ xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL, save_period = NULL, save_name = "xgboost.model", xgb_model = NULL, callbacks = list(), ...) { merged <- check.booster.params(params, ...) - dtrain <- xgb.get.DMatrix(data, label, missing, weight, nthread = merged$nthread) + dtrain <- xgb.get.DMatrix( + data = data, + label = label, + missing = missing, + weight = weight, + nthread = merged$nthread + ) watchlist <- list(train = dtrain) From 19b59938b7bf5bcc29b200268a5dd44e470b0705 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 15 Aug 2023 02:27:58 +0800 Subject: [PATCH 094/136] Convert input to str for hypothesis note. 
(#9480) --- tests/python-gpu/test_gpu_updaters.py | 8 ++++---- tests/python/test_updaters.py | 12 ++++++------ .../test_gpu_with_dask/test_gpu_with_dask.py | 2 +- .../test_with_dask/test_with_dask.py | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py index 653a99f3a..587210cf2 100644 --- a/tests/python-gpu/test_gpu_updaters.py +++ b/tests/python-gpu/test_gpu_updaters.py @@ -36,7 +36,7 @@ class TestGPUUpdatersMulti: param["tree_method"] = "gpu_hist" param = dataset.set_params(param) result = train_result(param, dataset.get_dmat(), num_rounds) - note(result) + note(str(result)) assert tm.non_increasing(result["train"][dataset.metric]) @@ -90,12 +90,12 @@ class TestGPUUpdaters: def test_sparse(self, dataset): param = {"tree_method": "hist", "max_bin": 64} hist_result = train_result(param, dataset.get_dmat(), 16) - note(hist_result) + note(str(hist_result)) assert tm.non_increasing(hist_result["train"][dataset.metric]) param = {"tree_method": "gpu_hist", "max_bin": 64} gpu_hist_result = train_result(param, dataset.get_dmat(), 16) - note(gpu_hist_result) + note(str(gpu_hist_result)) assert tm.non_increasing(gpu_hist_result["train"][dataset.metric]) np.testing.assert_allclose( @@ -221,7 +221,7 @@ class TestGPUUpdaters: dataset.get_device_dmat(max_bin=param.get("max_bin", None)), num_rounds, ) - note(result) + note(str(result)) assert tm.non_increasing(result["train"][dataset.metric], tolerance=1e-3) @given( diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index 3fa32660d..c4c0de032 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -58,7 +58,7 @@ class TestTreeMethodMulti: param.update(hist_param) param.update(cache_param) result = train_result(param, dataset.get_dmat(), num_rounds) - note(result) + note(str(result)) assert tm.non_increasing(result["train"][dataset.metric]) @given( @@ -84,7 +84,7 @@ class TestTreeMethodMulti: param.update(hist_param) param.update(cache_param) result = train_result(param, dataset.get_dmat(), num_rounds) - note(result) + note(str(result)) assert tm.non_increasing(result["train"][dataset.metric]) @@ -125,7 +125,7 @@ class TestTreeMethod: param.update(hist_param) param.update(cache_param) result = train_result(param, dataset.get_dmat(), num_rounds) - note(result) + note(str(result)) assert tm.non_increasing(result["train"][dataset.metric]) @pytest.mark.skipif(**tm.no_sklearn()) @@ -172,7 +172,7 @@ class TestTreeMethod: param.update(hist_param) param.update(cache_param) result = train_result(param, dataset.get_dmat(), num_rounds) - note(result) + note(str(result)) assert tm.non_increasing(result["train"][dataset.metric]) def test_hist_categorical(self): @@ -224,12 +224,12 @@ class TestTreeMethod: def test_sparse(self, dataset): param = {"tree_method": "hist", "max_bin": 64} hist_result = train_result(param, dataset.get_dmat(), 16) - note(hist_result) + note(str(hist_result)) assert tm.non_increasing(hist_result['train'][dataset.metric]) param = {"tree_method": "approx", "max_bin": 64} approx_result = train_result(param, dataset.get_dmat(), 16) - note(approx_result) + note(str(approx_result)) assert tm.non_increasing(approx_result['train'][dataset.metric]) np.testing.assert_allclose( diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py index 4cc934579..893582ee1 100644 --- 
a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py @@ -187,7 +187,7 @@ def run_gpu_hist( num_boost_round=num_rounds, evals=[(m, "train")], )["history"]["train"][dataset.metric] - note(history) + note(str(history)) # See note on `ObjFunction::UpdateTreeLeaf`. update_leaf = dataset.name.endswith("-l1") diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py index 5630e5f3e..664c0b89c 100644 --- a/tests/test_distributed/test_with_dask/test_with_dask.py +++ b/tests/test_distributed/test_with_dask/test_with_dask.py @@ -1484,7 +1484,7 @@ class TestWithDask: num_boost_round=num_rounds, evals=[(m, "train")], )["history"] - note(history) + note(str(history)) history = history["train"][dataset.metric] def is_stump(): From 8463107013c3f9e6466fdde6d03aff8938546af1 Mon Sep 17 00:00:00 2001 From: Boris Date: Mon, 14 Aug 2023 23:28:28 +0200 Subject: [PATCH 095/136] Updated versions. Reorganised dependencies. (#9479) --- jvm-packages/pom.xml | 19 +------------------ jvm-packages/xgboost4j-gpu/pom.xml | 15 +++++++++++++++ jvm-packages/xgboost4j/pom.xml | 15 +++++++++++++++ 3 files changed, 31 insertions(+), 18 deletions(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index d2e363601..baad9258d 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -78,7 +78,7 @@ scala-2.13 2.13 - 2.13.10 + 2.13.11 @@ -473,28 +473,11 @@ - com.esotericsoftware kryo 5.5.0 - - org.scala-lang - scala-compiler - ${scala.version} - provided - - - org.scala-lang - scala-library - ${scala.version} - - - org.scala-lang.modules - scala-collection-compat_${scala.binary.version} - ${scala-collection-compat.version} - commons-logging commons-logging diff --git a/jvm-packages/xgboost4j-gpu/pom.xml b/jvm-packages/xgboost4j-gpu/pom.xml index f34680302..61f9fb1cb 100644 --- a/jvm-packages/xgboost4j-gpu/pom.xml +++ b/jvm-packages/xgboost4j-gpu/pom.xml @@ -14,6 +14,21 @@ jar + + org.scala-lang + scala-compiler + ${scala.version} + + + org.scala-lang + scala-library + ${scala.version} + + + org.scala-lang.modules + scala-collection-compat_${scala.binary.version} + ${scala-collection-compat.version} + ai.rapids cudf diff --git a/jvm-packages/xgboost4j/pom.xml b/jvm-packages/xgboost4j/pom.xml index 4352aab12..3fb133dc8 100644 --- a/jvm-packages/xgboost4j/pom.xml +++ b/jvm-packages/xgboost4j/pom.xml @@ -14,6 +14,21 @@ jar + + org.scala-lang + scala-compiler + ${scala.version} + + + org.scala-lang + scala-library + ${scala.version} + + + org.scala-lang.modules + scala-collection-compat_${scala.binary.version} + ${scala-collection-compat.version} + org.apache.hadoop hadoop-hdfs From b82e78c1693f63c49cb4d0be62a220698d43f880 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 15 Aug 2023 00:44:08 -0500 Subject: [PATCH 096/136] [R] remove commented-out code (#9481) --- R-package/R/xgb.cv.R | 7 ------- 1 file changed, 7 deletions(-) diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 0b1baaa84..9e1ffeddc 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -135,9 +135,6 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing check.custom.obj() check.custom.eval() - #if (is.null(params[['eval_metric']]) && is.null(feval)) - # stop("Either 'eval_metric' or 'feval' must be provided for CV") - # Check the labels if ((inherits(data, 'xgb.DMatrix') && is.null(getinfo(data, 'label'))) || (!inherits(data, 'xgb.DMatrix') && 
is.null(label))) { @@ -161,10 +158,6 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing folds <- generate.cv.folds(nfold, nrow(data), stratified, cv_label, params) } - # Potential TODO: sequential CV - #if (strategy == 'sequential') - # stop('Sequential CV strategy is not yet implemented') - # verbosity & evaluation printing callback: params <- c(params, list(silent = 1)) print_every_n <- max(as.integer(print_every_n), 1L) From c061e3ae50931242a6695dfd7ec7b3a33f3f2c5d Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 16 Aug 2023 07:26:42 +0800 Subject: [PATCH 097/136] [jvm-packages] Bump rapids version. (#9482) --- jvm-packages/pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index baad9258d..baec6bd6b 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -43,8 +43,8 @@ 5 OFF OFF - 23.04.0 - 23.04.1 + 23.08.0 + 23.08.0 cuda11 3.2.16 2.10.0 From b2e93d2742314cb0594171442d7a4c33b830dca3 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 16 Aug 2023 13:35:55 +0800 Subject: [PATCH 098/136] [doc] Quick note for the `device` parameter. [skip ci] (#9483) --- doc/parameter.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/parameter.rst b/doc/parameter.rst index fdb4b8357..6e4756ee8 100644 --- a/doc/parameter.rst +++ b/doc/parameter.rst @@ -46,7 +46,7 @@ General Parameters + ``gpu``: Default GPU device selection from the list of available and supported devices. Only ``cuda`` devices are supported currently. + ``gpu:``: Default GPU device selection from the list of available and supported devices. Only ``cuda`` devices are supported currently. - For more information about GPU acceleration, see :doc:`/gpu/index`. + For more information about GPU acceleration, see :doc:`/gpu/index`. In distributed environments, ordinal selection is handled by distributed frameworks instead of XGBoost. As a result, using ``cuda:`` will result in an error. Use ``cuda`` instead. 
* ``verbosity`` [default=1] From 12fe2fc06c47064299bcb9693aab45878e610005 Mon Sep 17 00:00:00 2001 From: Sean Yang Date: Wed, 16 Aug 2023 00:25:05 -0700 Subject: [PATCH 099/136] Fix federated learning demos and tests (#9488) --- demo/nvflare/.gitignore | 1 + demo/nvflare/config/config_fed_client.json | 23 +++++++++++++++++++ demo/nvflare/config/config_fed_server.json | 22 ++++++++++++++++++ demo/nvflare/horizontal/README.md | 2 +- demo/nvflare/horizontal/prepare_data.sh | 2 +- demo/nvflare/vertical/README.md | 2 +- demo/nvflare/vertical/custom/trainer.py | 5 +++- demo/nvflare/vertical/prepare_data.sh | 2 +- .../test_federated/runtests-federated.sh | 4 ++-- .../test_federated/test_federated.py | 8 +++---- 10 files changed, 60 insertions(+), 11 deletions(-) create mode 100644 demo/nvflare/.gitignore create mode 100644 demo/nvflare/config/config_fed_client.json create mode 100644 demo/nvflare/config/config_fed_server.json diff --git a/demo/nvflare/.gitignore b/demo/nvflare/.gitignore new file mode 100644 index 000000000..d5702b886 --- /dev/null +++ b/demo/nvflare/.gitignore @@ -0,0 +1 @@ +!config diff --git a/demo/nvflare/config/config_fed_client.json b/demo/nvflare/config/config_fed_client.json new file mode 100644 index 000000000..cfe294172 --- /dev/null +++ b/demo/nvflare/config/config_fed_client.json @@ -0,0 +1,23 @@ +{ + "format_version": 2, + "executors": [ + { + "tasks": [ + "train" + ], + "executor": { + "path": "trainer.XGBoostTrainer", + "args": { + "server_address": "localhost:9091", + "world_size": 2, + "server_cert_path": "server-cert.pem", + "client_key_path": "client-key.pem", + "client_cert_path": "client-cert.pem", + "use_gpus": false + } + } + } + ], + "task_result_filters": [], + "task_data_filters": [] +} diff --git a/demo/nvflare/config/config_fed_server.json b/demo/nvflare/config/config_fed_server.json new file mode 100644 index 000000000..32993b652 --- /dev/null +++ b/demo/nvflare/config/config_fed_server.json @@ -0,0 +1,22 @@ +{ + "format_version": 2, + "server": { + "heart_beat_timeout": 600 + }, + "task_data_filters": [], + "task_result_filters": [], + "workflows": [ + { + "id": "server_workflow", + "path": "controller.XGBoostController", + "args": { + "port": 9091, + "world_size": 2, + "server_key_path": "server-key.pem", + "server_cert_path": "server-cert.pem", + "client_cert_path": "client-cert.pem" + } + } + ], + "components": [] +} diff --git a/demo/nvflare/horizontal/README.md b/demo/nvflare/horizontal/README.md index 744e90915..19ac4cf4e 100644 --- a/demo/nvflare/horizontal/README.md +++ b/demo/nvflare/horizontal/README.md @@ -6,7 +6,7 @@ This directory contains a demo of Horizontal Federated Learning using ## Training with CPU only To run the demo, first build XGBoost with the federated learning plugin enabled (see the -[README](../../plugin/federated/README.md)). +[README](../../../plugin/federated/README.md)). 
Install NVFlare (note that currently NVFlare only supports Python 3.8): ```shell diff --git a/demo/nvflare/horizontal/prepare_data.sh b/demo/nvflare/horizontal/prepare_data.sh index eed1390b5..eb3a19d50 100755 --- a/demo/nvflare/horizontal/prepare_data.sh +++ b/demo/nvflare/horizontal/prepare_data.sh @@ -16,7 +16,7 @@ split -n l/${world_size} --numeric-suffixes=1 -a 1 ../../data/agaricus.txt.test nvflare poc -n 2 --prepare mkdir -p /tmp/nvflare/poc/admin/transfer/horizontal-xgboost -cp -fr config custom /tmp/nvflare/poc/admin/transfer/horizontal-xgboost +cp -fr ../config custom /tmp/nvflare/poc/admin/transfer/horizontal-xgboost cp server-*.pem client-cert.pem /tmp/nvflare/poc/server/ for (( site=1; site<=world_size; site++ )); do cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"$site"/ diff --git a/demo/nvflare/vertical/README.md b/demo/nvflare/vertical/README.md index 83c3111b6..f9cca57d9 100644 --- a/demo/nvflare/vertical/README.md +++ b/demo/nvflare/vertical/README.md @@ -6,7 +6,7 @@ This directory contains a demo of Vertical Federated Learning using ## Training with CPU only To run the demo, first build XGBoost with the federated learning plugin enabled (see the -[README](../../plugin/federated/README.md)). +[README](../../../plugin/federated/README.md)). Install NVFlare (note that currently NVFlare only supports Python 3.8): ```shell diff --git a/demo/nvflare/vertical/custom/trainer.py b/demo/nvflare/vertical/custom/trainer.py index cd420129c..1c235a439 100644 --- a/demo/nvflare/vertical/custom/trainer.py +++ b/demo/nvflare/vertical/custom/trainer.py @@ -16,7 +16,7 @@ class SupportedTasks(object): class XGBoostTrainer(Executor): def __init__(self, server_address: str, world_size: int, server_cert_path: str, - client_key_path: str, client_cert_path: str): + client_key_path: str, client_cert_path: str, use_gpus: bool): """Trainer for federated XGBoost. 
Args: @@ -32,6 +32,7 @@ class XGBoostTrainer(Executor): self._server_cert_path = server_cert_path self._client_key_path = client_key_path self._client_cert_path = client_cert_path + self._use_gpus = use_gpus def execute(self, task_name: str, shareable: Shareable, fl_ctx: FLContext, abort_signal: Signal) -> Shareable: @@ -81,6 +82,8 @@ class XGBoostTrainer(Executor): 'objective': 'binary:logistic', 'eval_metric': 'auc', } + if self._use_gpus: + self.log_info(fl_ctx, 'GPUs are not currently supported by vertical federated XGBoost') # specify validations set to watch performance watchlist = [(dtest, "eval"), (dtrain, "train")] diff --git a/demo/nvflare/vertical/prepare_data.sh b/demo/nvflare/vertical/prepare_data.sh index 86ec3dfa2..398ba2a10 100755 --- a/demo/nvflare/vertical/prepare_data.sh +++ b/demo/nvflare/vertical/prepare_data.sh @@ -56,7 +56,7 @@ fi nvflare poc -n 2 --prepare mkdir -p /tmp/nvflare/poc/admin/transfer/vertical-xgboost -cp -fr config custom /tmp/nvflare/poc/admin/transfer/vertical-xgboost +cp -fr ../config custom /tmp/nvflare/poc/admin/transfer/vertical-xgboost cp server-*.pem client-cert.pem /tmp/nvflare/poc/server/ for (( site=1; site<=world_size; site++ )); do cp server-cert.pem client-*.pem /tmp/nvflare/poc/site-"${site}"/ diff --git a/tests/test_distributed/test_federated/runtests-federated.sh b/tests/test_distributed/test_federated/runtests-federated.sh index 81a40c350..8bdb2bc5b 100755 --- a/tests/test_distributed/test_federated/runtests-federated.sh +++ b/tests/test_distributed/test_federated/runtests-federated.sh @@ -11,7 +11,7 @@ openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout server-key.pem -out se openssl req -x509 -newkey rsa:2048 -days 7 -nodes -keyout client-key.pem -out client-cert.pem -subj "/C=US/CN=localhost" # Split train and test files manually to simulate a federated environment. -split -n l/"${world_size}" -d ../../demo/data/agaricus.txt.train agaricus.txt.train- -split -n l/"${world_size}" -d ../../demo/data/agaricus.txt.test agaricus.txt.test- +split -n l/"${world_size}" -d ../../../demo/data/agaricus.txt.train agaricus.txt.train- +split -n l/"${world_size}" -d ../../../demo/data/agaricus.txt.test agaricus.txt.test- python test_federated.py "${world_size}" diff --git a/tests/test_distributed/test_federated/test_federated.py b/tests/test_distributed/test_federated/test_federated.py index 9b8e55915..dba797078 100644 --- a/tests/test_distributed/test_federated/test_federated.py +++ b/tests/test_distributed/test_federated/test_federated.py @@ -35,14 +35,14 @@ def run_worker(port: int, world_size: int, rank: int, with_ssl: bool, with_gpu: # Always call this before using distributed module with xgb.collective.CommunicatorContext(**communicator_env): # Load file, file will not be sharded in federated mode. 
- dtrain = xgb.DMatrix('agaricus.txt.train-%02d' % rank) - dtest = xgb.DMatrix('agaricus.txt.test-%02d' % rank) + dtrain = xgb.DMatrix('agaricus.txt.train-%02d?format=libsvm' % rank) + dtest = xgb.DMatrix('agaricus.txt.test-%02d?format=libsvm' % rank) # Specify parameters via map, definition are same as c++ version param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'} if with_gpu: - param['tree_method'] = 'gpu_hist' - param['gpu_id'] = rank + param['tree_method'] = 'hist' + param['device'] = f"cuda:{rank}" # Specify validations set to watch performance watchlist = [(dtest, 'eval'), (dtrain, 'train')] From f380c10a939b87f8d7d26502ce2cad888998934f Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 16 Aug 2023 16:08:41 +0800 Subject: [PATCH 100/136] Use hint for find nccl. (#9490) --- cmake/modules/FindNccl.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/modules/FindNccl.cmake b/cmake/modules/FindNccl.cmake index f37955f6f..5f06f96b8 100644 --- a/cmake/modules/FindNccl.cmake +++ b/cmake/modules/FindNccl.cmake @@ -52,11 +52,11 @@ endif (BUILD_WITH_SHARED_NCCL) find_path(NCCL_INCLUDE_DIR NAMES nccl.h - PATHS $ENV{NCCL_ROOT}/include ${NCCL_ROOT}/include) + HINTS ${NCCL_ROOT}/include $ENV{NCCL_ROOT}/include) find_library(NCCL_LIBRARY NAMES ${NCCL_LIB_NAME} - PATHS $ENV{NCCL_ROOT}/lib/ ${NCCL_ROOT}/lib) + HINTS ${NCCL_ROOT}/lib $ENV{NCCL_ROOT}/lib/) message(STATUS "Using nccl library: ${NCCL_LIBRARY}") From 5188e275135306652fb0fdb2b3a6a771c147ad8e Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 16 Aug 2023 22:44:58 +0800 Subject: [PATCH 101/136] Fix version parsing with rc release. (#9493) --- python-package/xgboost/core.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index fbedfd7fb..27d62cdf2 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -235,8 +235,11 @@ Error message(s): {os_error_list} def parse(ver: str) -> Tuple[int, int, int]: """Avoid dependency on packaging (PEP 440).""" - # 2.0.0-dev or 2.0.0 + # 2.0.0-dev, 2.0.0, or 2.0.0rc1 major, minor, patch = ver.split("-")[0].split(".") + rc = patch.find("rc") + if rc != -1: + patch = patch[:rc] return int(major), int(minor), int(patch) libver = _lib_version(lib) From 68be454cfa5512a1c3a46eef80bcc48ca53fb4ba Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Thu, 17 Aug 2023 16:01:39 +0800 Subject: [PATCH 102/136] [pyspark] hotfix for GPU setup validation (#9495) * [pyspark] fix a bug of validating gpu configuration --------- Co-authored-by: Jiaming Yuan --- python-package/xgboost/spark/core.py | 44 ++++++++++++++++------------ 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index a072e9961..af58c994f 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -424,35 +424,41 @@ class _SparkXGBParams( if is_local: # checking spark local mode. - if gpu_per_task: + if gpu_per_task is not None: raise RuntimeError( - "The spark cluster does not support gpu configuration for local mode. " - "Please delete spark.executor.resource.gpu.amount and " + "The spark local mode does not support gpu configuration." 
+ "Please remove spark.executor.resource.gpu.amount and " "spark.task.resource.gpu.amount" ) - # Support GPU training in Spark local mode is just for debugging purposes, - # so it's okay for printing the below warning instead of checking the real - # gpu numbers and raising the exception. + # Support GPU training in Spark local mode is just for debugging + # purposes, so it's okay for printing the below warning instead of + # checking the real gpu numbers and raising the exception. get_logger(self.__class__.__name__).warning( - "You enabled GPU in spark local mode. Please make sure your local " - "node has at least %d GPUs", + "You have enabled GPU in spark local mode. Please make sure your" + " local node has at least %d GPUs", self.getOrDefault(self.num_workers), ) else: # checking spark non-local mode. - if not gpu_per_task or int(gpu_per_task) < 1: - raise RuntimeError( - "The spark cluster does not have the necessary GPU" - + "configuration for the spark task. Therefore, we cannot" - + "run xgboost training using GPU." - ) + if gpu_per_task is not None: + if float(gpu_per_task) < 1.0: + raise ValueError( + "XGBoost doesn't support GPU fractional configurations. " + "Please set `spark.task.resource.gpu.amount=spark.executor" + ".resource.gpu.amount`" + ) - if int(gpu_per_task) > 1: - get_logger(self.__class__.__name__).warning( - "You configured %s GPU cores for each spark task, but in " - "XGBoost training, every Spark task will only use one GPU core.", - gpu_per_task, + if float(gpu_per_task) > 1.0: + get_logger(self.__class__.__name__).warning( + "%s GPUs for each Spark task is configured, but each " + "XGBoost training task uses only 1 GPU.", + gpu_per_task, + ) + else: + raise ValueError( + "The `spark.task.resource.gpu.amount` is required for training" + " on GPU." ) From 58530b1bc459b81e318062f17fa2d6175eda8532 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 18 Aug 2023 01:04:04 +0800 Subject: [PATCH 103/136] Bump version to 2.1. (#9498) --- CMakeLists.txt | 2 +- R-package/DESCRIPTION | 4 ++-- R-package/configure | 18 +++++++++--------- R-package/configure.ac | 2 +- include/xgboost/version_config.h | 2 +- jvm-packages/pom.xml | 2 +- jvm-packages/xgboost4j-example/pom.xml | 4 ++-- jvm-packages/xgboost4j-flink/pom.xml | 4 ++-- jvm-packages/xgboost4j-gpu/pom.xml | 4 ++-- jvm-packages/xgboost4j-spark-gpu/pom.xml | 2 +- jvm-packages/xgboost4j-spark/pom.xml | 2 +- jvm-packages/xgboost4j/pom.xml | 4 ++-- python-package/pyproject.toml | 2 +- python-package/xgboost/VERSION | 2 +- 14 files changed, 27 insertions(+), 27 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a5eebef2e..3bffa6b07 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR) -project(xgboost LANGUAGES CXX C VERSION 2.0.0) +project(xgboost LANGUAGES CXX C VERSION 2.1.0) include(cmake/Utils.cmake) list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules") cmake_policy(SET CMP0022 NEW) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 9ceef2fda..6eb0cdc28 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -1,8 +1,8 @@ Package: xgboost Type: Package Title: Extreme Gradient Boosting -Version: 2.0.0.1 -Date: 2022-10-18 +Version: 2.1.0.1 +Date: 2023-08-17 Authors@R: c( person("Tianqi", "Chen", role = c("aut"), email = "tianqi.tchen@gmail.com"), diff --git a/R-package/configure b/R-package/configure index 19ea48a91..3bbfa7150 100755 --- a/R-package/configure +++ b/R-package/configure @@ -1,6 +1,6 @@ #! 
/bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for xgboost 2.0.0. +# Generated by GNU Autoconf 2.71 for xgboost 2.1.0. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -607,8 +607,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='xgboost' PACKAGE_TARNAME='xgboost' -PACKAGE_VERSION='2.0.0' -PACKAGE_STRING='xgboost 2.0.0' +PACKAGE_VERSION='2.1.0' +PACKAGE_STRING='xgboost 2.1.0' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1225,7 +1225,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures xgboost 2.0.0 to adapt to many kinds of systems. +\`configure' configures xgboost 2.1.0 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1287,7 +1287,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of xgboost 2.0.0:";; + short | recursive ) echo "Configuration of xgboost 2.1.0:";; esac cat <<\_ACEOF @@ -1367,7 +1367,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -xgboost configure 2.0.0 +xgboost configure 2.1.0 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1533,7 +1533,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by xgboost $as_me 2.0.0, which was +It was created by xgboost $as_me 2.1.0, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3412,7 +3412,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by xgboost $as_me 2.0.0, which was +This file was extended by xgboost $as_me 2.1.0, which was generated by GNU Autoconf 2.71. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -3467,7 +3467,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -xgboost config.status 2.0.0 +xgboost config.status 2.1.0 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/R-package/configure.ac b/R-package/configure.ac index 1fb6ea35a..89f8635fe 100644 --- a/R-package/configure.ac +++ b/R-package/configure.ac @@ -2,7 +2,7 @@ AC_PREREQ(2.69) -AC_INIT([xgboost],[2.0.0],[],[xgboost],[]) +AC_INIT([xgboost],[2.1.0],[],[xgboost],[]) : ${R_HOME=`R RHOME`} if test -z "${R_HOME}"; then diff --git a/include/xgboost/version_config.h b/include/xgboost/version_config.h index 8005b8391..70e5417af 100644 --- a/include/xgboost/version_config.h +++ b/include/xgboost/version_config.h @@ -5,7 +5,7 @@ #define XGBOOST_VERSION_CONFIG_H_ #define XGBOOST_VER_MAJOR 2 /* NOLINT */ -#define XGBOOST_VER_MINOR 0 /* NOLINT */ +#define XGBOOST_VER_MINOR 1 /* NOLINT */ #define XGBOOST_VER_PATCH 0 /* NOLINT */ #endif // XGBOOST_VERSION_CONFIG_H_ diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index baec6bd6b..738d57994 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -6,7 +6,7 @@ ml.dmlc xgboost-jvm - 2.0.0-SNAPSHOT + 2.1.0-SNAPSHOT pom XGBoost JVM Package JVM Package for XGBoost diff --git a/jvm-packages/xgboost4j-example/pom.xml b/jvm-packages/xgboost4j-example/pom.xml index e6ed8a600..3a56615d6 100644 --- a/jvm-packages/xgboost4j-example/pom.xml +++ b/jvm-packages/xgboost4j-example/pom.xml @@ -6,11 +6,11 @@ ml.dmlc xgboost-jvm - 2.0.0-SNAPSHOT + 2.1.0-SNAPSHOT xgboost4j-example xgboost4j-example_${scala.binary.version} - 2.0.0-SNAPSHOT + 2.1.0-SNAPSHOT jar diff --git a/jvm-packages/xgboost4j-flink/pom.xml b/jvm-packages/xgboost4j-flink/pom.xml index 8d51a9dcf..6f700ca0a 100644 --- a/jvm-packages/xgboost4j-flink/pom.xml +++ b/jvm-packages/xgboost4j-flink/pom.xml @@ -6,12 +6,12 @@ ml.dmlc xgboost-jvm - 2.0.0-SNAPSHOT + 2.1.0-SNAPSHOT xgboost4j-flink xgboost4j-flink_${scala.binary.version} - 2.0.0-SNAPSHOT + 2.1.0-SNAPSHOT 2.2.0 diff --git a/jvm-packages/xgboost4j-gpu/pom.xml b/jvm-packages/xgboost4j-gpu/pom.xml index 61f9fb1cb..8a65afeb5 100644 --- a/jvm-packages/xgboost4j-gpu/pom.xml +++ b/jvm-packages/xgboost4j-gpu/pom.xml @@ -6,11 +6,11 @@ ml.dmlc xgboost-jvm - 2.0.0-SNAPSHOT + 2.1.0-SNAPSHOT xgboost4j-gpu_${scala.binary.version} xgboost4j-gpu - 2.0.0-SNAPSHOT + 2.1.0-SNAPSHOT jar diff --git a/jvm-packages/xgboost4j-spark-gpu/pom.xml b/jvm-packages/xgboost4j-spark-gpu/pom.xml index b7be69e69..a29b4e056 100644 --- a/jvm-packages/xgboost4j-spark-gpu/pom.xml +++ b/jvm-packages/xgboost4j-spark-gpu/pom.xml @@ -6,7 +6,7 @@ ml.dmlc xgboost-jvm - 2.0.0-SNAPSHOT + 2.1.0-SNAPSHOT xgboost4j-spark-gpu xgboost4j-spark-gpu_${scala.binary.version} diff --git a/jvm-packages/xgboost4j-spark/pom.xml b/jvm-packages/xgboost4j-spark/pom.xml index d8f4cb914..179b1c762 100644 --- a/jvm-packages/xgboost4j-spark/pom.xml +++ b/jvm-packages/xgboost4j-spark/pom.xml @@ -6,7 +6,7 @@ ml.dmlc xgboost-jvm - 2.0.0-SNAPSHOT + 2.1.0-SNAPSHOT xgboost4j-spark xgboost4j-spark_${scala.binary.version} diff --git a/jvm-packages/xgboost4j/pom.xml b/jvm-packages/xgboost4j/pom.xml index 3fb133dc8..46ee9158f 100644 --- a/jvm-packages/xgboost4j/pom.xml +++ b/jvm-packages/xgboost4j/pom.xml @@ -6,11 +6,11 @@ ml.dmlc xgboost-jvm - 2.0.0-SNAPSHOT + 2.1.0-SNAPSHOT xgboost4j 
xgboost4j_${scala.binary.version} - 2.0.0-SNAPSHOT + 2.1.0-SNAPSHOT jar diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index 2bf463db6..199e0f06c 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -7,7 +7,7 @@ build-backend = "packager.pep517" [project] name = "xgboost" -version = "2.0.0-dev" +version = "2.1.0-dev" authors = [ { name = "Hyunsu Cho", email = "chohyu01@cs.washington.edu" }, { name = "Jiaming Yuan", email = "jm.yuan@outlook.com" } diff --git a/python-package/xgboost/VERSION b/python-package/xgboost/VERSION index d72f26267..c10edc3fa 100644 --- a/python-package/xgboost/VERSION +++ b/python-package/xgboost/VERSION @@ -1 +1 @@ -2.0.0-dev +2.1.0-dev From b74802dea9260c4c12d3b89d10dbbb6abb017856 Mon Sep 17 00:00:00 2001 From: Thomas Zeger Date: Thu, 17 Aug 2023 16:36:06 -0400 Subject: [PATCH 104/136] Fix safe_xgboost macro on c++ (#9501) --- doc/tutorials/c_api_tutorial.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/tutorials/c_api_tutorial.rst b/doc/tutorials/c_api_tutorial.rst index 3c33278be..bb1db8249 100644 --- a/doc/tutorials/c_api_tutorial.rst +++ b/doc/tutorials/c_api_tutorial.rst @@ -104,8 +104,8 @@ b. In a C++ application: modify the macro ``safe_xgboost`` to throw an exception #define safe_xgboost(call) { \ int err = (call); \ if (err != 0) { \ - throw new Exception(std::string(__FILE__) + ":" + std::to_string(__LINE__) + \ - ": error in " + #call + ":" + XGBGetLastError())); \ + throw std::runtime_error(std::string(__FILE__) + ":" + std::to_string(__LINE__) + \ + ": error in " + #call + ":" + XGBGetLastError()); \ } \ } From 0bb87b5b35bf7632d9b292bbaa3bea022ef80bfc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 18 Aug 2023 20:59:04 +0800 Subject: [PATCH 105/136] Bump hadoop.version from 3.3.5 to 3.3.6 in /jvm-packages (#9331) Bumps `hadoop.version` from 3.3.5 to 3.3.6. Updates `hadoop-hdfs` from 3.3.5 to 3.3.6 Updates `hadoop-common` from 3.3.5 to 3.3.6 --- updated-dependencies: - dependency-name: org.apache.hadoop:hadoop-hdfs dependency-type: direct:production update-type: version-update:semver-patch - dependency-name: org.apache.hadoop:hadoop-common dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index 738d57994..0ab1f008e 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -39,7 +39,7 @@ 3.3.2 2.12.18 2.12 - 3.3.5 + 3.3.6 5 OFF OFF From 7f29a238e6041a052b015b4ad99c5ddfd89f2640 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 19 Aug 2023 12:28:02 +0800 Subject: [PATCH 106/136] Return base score as intercept. (#9486) --- python-package/xgboost/sklearn.py | 24 ++++++++++++------------ tests/python/test_with_sklearn.py | 16 ++++++++++++++++ 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index e791be51c..cb738477b 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -1359,25 +1359,25 @@ class XGBModel(XGBModelBase): @property def intercept_(self) -> np.ndarray: - """ - Intercept (bias) property + """Intercept (bias) property - .. 
note:: Intercept is defined only for linear learners - - Intercept (bias) is only defined when the linear model is chosen as base - learner (`booster=gblinear`). It is not defined for other base learner types, - such as tree learners (`booster=gbtree`). + For tree-based model, the returned value is the `base_score`. Returns ------- intercept_ : array of shape ``(1,)`` or ``[n_classes]`` + """ - if self.get_xgb_params()["booster"] != "gblinear": - raise AttributeError( - f"Intercept (bias) is not defined for Booster type {self.booster}" - ) + booster_config = self.get_xgb_params()["booster"] b = self.get_booster() - return np.array(json.loads(b.get_dump(dump_format="json")[0])["bias"]) + if booster_config != "gblinear": # gbtree, dart + config = json.loads(b.save_config()) + intercept = config["learner"]["learner_model_param"]["base_score"] + return np.array([float(intercept)], dtype=np.float32) + + return np.array( + json.loads(b.get_dump(dump_format="json")[0])["bias"], dtype=np.float32 + ) PredtT = TypeVar("PredtT", bound=np.ndarray) diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index 69f144caf..b40ae67c5 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -1507,6 +1507,7 @@ def test_evaluation_metric(): # shape check inside the `merror` function clf.fit(X, y, eval_set=[(X, y)]) + def test_weighted_evaluation_metric(): from sklearn.datasets import make_hastie_10_2 from sklearn.metrics import log_loss @@ -1544,3 +1545,18 @@ def test_weighted_evaluation_metric(): internal["validation_0"]["logloss"], atol=1e-6 ) + + +def test_intercept() -> None: + X, y, w = tm.make_regression(256, 3, use_cupy=False) + reg = xgb.XGBRegressor() + reg.fit(X, y, sample_weight=w) + result = reg.intercept_ + assert result.dtype == np.float32 + assert result[0] < 0.5 + + reg = xgb.XGBRegressor(booster="gblinear") + reg.fit(X, y, sample_weight=w) + result = reg.intercept_ + assert result.dtype == np.float32 + assert result[0] < 0.5 From d016309a15aacf1d634995ac81568a14e7f1682b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 19 Aug 2023 18:14:35 +0800 Subject: [PATCH 107/136] Bump spark.version from 3.4.0 to 3.4.1 in /jvm-packages/xgboost4j-spark (#9326) Bumps `spark.version` from 3.4.0 to 3.4.1. Updates `spark-core_2.12` from 3.4.0 to 3.4.1 Updates `spark-sql_2.12` from 3.4.0 to 3.4.1 Updates `spark-mllib_2.12` from 3.4.0 to 3.4.1 --- updated-dependencies: - dependency-name: org.apache.spark:spark-core_2.12 dependency-type: direct:production update-type: version-update:semver-patch - dependency-name: org.apache.spark:spark-sql_2.12 dependency-type: direct:production update-type: version-update:semver-patch - dependency-name: org.apache.spark:spark-mllib_2.12 dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index 0ab1f008e..79614cf03 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -35,7 +35,7 @@ 1.8 1.17.1 4.13.2 - 3.4.0 + 3.4.1 3.3.2 2.12.18 2.12 From 5358e1ebf02a5149d59af2dc2016f345a2d63cba Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 20 Aug 2023 00:37:15 +0800 Subject: [PATCH 108/136] Bump org.apache.commons:commons-lang3 in /jvm-packages (#9489) Bumps org.apache.commons:commons-lang3 from 3.12.0 to 3.13.0. --- updated-dependencies: - dependency-name: org.apache.commons:commons-lang3 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/xgboost4j-gpu/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/xgboost4j-gpu/pom.xml b/jvm-packages/xgboost4j-gpu/pom.xml index 8a65afeb5..c08988ac8 100644 --- a/jvm-packages/xgboost4j-gpu/pom.xml +++ b/jvm-packages/xgboost4j-gpu/pom.xml @@ -63,7 +63,7 @@ org.apache.commons commons-lang3 - 3.12.0 + 3.13.0 From db87d481bc1ffa3023bbe19213af37f0a0574cfd Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sun, 20 Aug 2023 02:58:58 +0800 Subject: [PATCH 109/136] [R] Differentiate dev version with release version. (#9503) Use 2.1.0.0 as development version, we will change it to 2.1.0.1 during release. --- R-package/DESCRIPTION | 4 ++-- tests/ci_build/change_version.py | 17 ++++++++++------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 6eb0cdc28..d301b0a5c 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -1,8 +1,8 @@ Package: xgboost Type: Package Title: Extreme Gradient Boosting -Version: 2.1.0.1 -Date: 2023-08-17 +Version: 2.1.0.0 +Date: 2023-08-19 Authors@R: c( person("Tianqi", "Chen", role = c("aut"), email = "tianqi.tchen@gmail.com"), diff --git a/tests/ci_build/change_version.py b/tests/ci_build/change_version.py index 25561859c..d1ef8ce09 100644 --- a/tests/ci_build/change_version.py +++ b/tests/ci_build/change_version.py @@ -61,8 +61,11 @@ def pypkg( @cd(R_PACKAGE) -def rpkg(major: int, minor: int, patch: int) -> None: - version = f"{major}.{minor}.{patch}.1" +def rpkg(major: int, minor: int, patch: int, is_dev: bool) -> None: + if is_dev: + version = f"{major}.{minor}.{patch}.0" + else: + version = f"{major}.{minor}.{patch}.1" # Version: 2.0.0.1 desc_path = "DESCRIPTION" with open(desc_path, "r") as fd: @@ -119,8 +122,8 @@ def main(args: argparse.Namespace) -> None: minor = args.minor patch = args.patch rc = args.rc - is_rc = args.is_rc == 1 - is_dev = args.is_dev == 1 + is_rc = args.is_rc + is_dev = args.is_dev if is_rc and is_dev: raise ValueError("It cannot be both a rc and a dev branch.") if is_rc: @@ -130,7 +133,7 @@ def main(args: argparse.Namespace) -> None: cmake(major, minor, patch) pypkg(major, minor, patch, rc, is_rc, is_dev) - rpkg(major, minor, patch) + rpkg(major, minor, patch, is_dev=is_dev) jvmpkgs(major, minor, patch, rc, is_rc, is_dev) print( @@ -149,8 +152,8 @@ if __name__ == "__main__": parser.add_argument("--minor", type=int) parser.add_argument("--patch", type=int) parser.add_argument("--rc", type=int, default=0) - 
parser.add_argument("--is-rc", type=int, choices=[0, 1]) - parser.add_argument("--is-dev", type=int, choices=[0, 1]) + parser.add_argument("--is-rc", action="store_true") + parser.add_argument("--is-dev", action="store_true") args = parser.parse_args() try: main(args) From 74d5056c614a0121dcda509318134e37c214705b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 20 Aug 2023 04:20:07 +0800 Subject: [PATCH 110/136] Bump spark.version.gpu in /jvm-packages/xgboost4j-spark-gpu (#9328) Bumps `spark.version.gpu` from 3.3.2 to 3.4.1. Updates `spark-core_2.12` from 3.3.2 to 3.4.1 Updates `spark-sql_2.12` from 3.3.2 to 3.4.1 Updates `spark-mllib_2.12` from 3.3.2 to 3.4.1 --- updated-dependencies: - dependency-name: org.apache.spark:spark-core_2.12 dependency-type: direct:production update-type: version-update:semver-minor - dependency-name: org.apache.spark:spark-sql_2.12 dependency-type: direct:production update-type: version-update:semver-minor - dependency-name: org.apache.spark:spark-mllib_2.12 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index 79614cf03..e93f9fef1 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -36,7 +36,7 @@ 1.17.1 4.13.2 3.4.1 - 3.3.2 + 3.4.1 2.12.18 2.12 3.3.6 From 38a3e1b858c12e19e1584474f61ba0dd50582e4d Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 21 Aug 2023 05:24:35 +0800 Subject: [PATCH 111/136] Fix release script for RC [skip ci] (#9505) --- dev/release-artifacts.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/dev/release-artifacts.py b/dev/release-artifacts.py index eab64ff0c..d9b9d6203 100644 --- a/dev/release-artifacts.py +++ b/dev/release-artifacts.py @@ -98,17 +98,24 @@ def download_wheels( return filenames -def make_pysrc_wheel(release: str, outdir: str) -> None: +def make_pysrc_wheel( + release: str, rc: Optional[str], rc_ver: Optional[int], outdir: str +) -> None: """Make Python source distribution.""" - dist = os.path.join(outdir, "dist") + dist = os.path.abspath(os.path.normpath(os.path.join(outdir, "dist"))) if not os.path.exists(dist): os.mkdir(dist) with DirectoryExcursion(os.path.join(ROOT, "python-package")): subprocess.check_call(["python", "-m", "build", "--sdist"]) - src = os.path.join(DIST, f"xgboost-{release}.tar.gz") + if rc is not None: + name = f"xgboost-{release}{rc}{rc_ver}.tar.gz" + else: + name = f"xgboost-{release}.tar.gz" + src = os.path.join(DIST, name) subprocess.check_call(["twine", "check", src]) - shutil.move(src, os.path.join(dist, f"xgboost-{release}.tar.gz")) + target = os.path.join(dist, name) + shutil.move(src, target) def download_py_packages( @@ -172,7 +179,9 @@ def download_r_packages( hashes = [] with DirectoryExcursion(os.path.join(outdir, "r-packages")): for f in filenames: - ret = subprocess.run(["sha256sum", os.path.basename(f)], capture_output=True) + ret = subprocess.run( + ["sha256sum", os.path.basename(f)], capture_output=True + ) h = ret.stdout.decode().strip() hashes.append(h) return urls, hashes @@ -306,7 +315,7 @@ def main(args: argparse.Namespace) -> None: hashes.extend(hr) # Python source wheel - make_pysrc_wheel(release, args.outdir) + make_pysrc_wheel(release, rc, rc_ver, args.outdir) # Python binary 
wheels download_py_packages(branch, major, minor, commit_hash, args.outdir) From e6cf7a12785ab73f8aac79052a876ab40c65684f Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 21 Aug 2023 06:47:48 +0800 Subject: [PATCH 112/136] Deprecate the command line interface. (#9485) --------- Co-authored-by: Philip Hyunsu Cho --- .github/workflows/main.yml | 4 +- .github/workflows/python_tests.yml | 6 +- CMakeLists.txt | 67 +++++++++++++------ demo/CLI/README.rst | 4 ++ src/cli_main.cc | 4 +- tests/buildkite/build-win64-gpu.ps1 | 2 +- tests/ci_build/build_r_pkg_with_cuda.sh | 1 - tests/ci_build/build_r_pkg_with_cuda_win64.sh | 1 - tests/ci_build/build_via_cmake.sh | 2 +- 9 files changed, 59 insertions(+), 32 deletions(-) create mode 100644 demo/CLI/README.rst diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index ab2a58fe9..0288b0c97 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -29,7 +29,7 @@ jobs: run: | mkdir build cd build - cmake .. -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_DENSE_PARSER=ON -GNinja + cmake .. -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_DENSE_PARSER=ON -GNinja -DBUILD_DEPRECATED_CLI=ON ninja -v - name: Run gtest binary run: | @@ -56,7 +56,7 @@ jobs: run: | mkdir build cd build - cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_OPENMP=OFF + cmake .. -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_OPENMP=OFF -DBUILD_DEPRECATED_CLI=ON ninja -v - name: Run gtest binary run: | diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index b9e97d439..532c9277a 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -143,7 +143,7 @@ jobs: # Set prefix, to use OpenMP library from Conda env # See https://github.com/dmlc/xgboost/issues/7039#issuecomment-1025038228 # to learn why we don't use libomp from Homebrew. - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX + cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON ninja - name: Install Python package @@ -190,7 +190,7 @@ jobs: run: | mkdir build_msvc cd build_msvc - cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON + cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON cmake --build . --config Release --parallel $(nproc) - name: Install Python package @@ -234,7 +234,7 @@ jobs: run: | mkdir build cd build - cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX + cmake .. -GNinja -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBUILD_DEPRECATED_CLI=ON ninja - name: Install Python package diff --git a/CMakeLists.txt b/CMakeLists.txt index 3bffa6b07..29043b594 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,6 +45,7 @@ set_default_configuration_release() option(BUILD_C_DOC "Build documentation for C APIs using Doxygen." OFF) option(USE_OPENMP "Build with OpenMP support." 
ON) option(BUILD_STATIC_LIB "Build static library" OFF) +option(BUILD_DEPRECATED_CLI "Build the deprecated command line interface" OFF) option(FORCE_SHARED_CRT "Build with dynamic CRT on Windows (/MD)" OFF) option(RABIT_BUILD_MPI "Build MPI" OFF) ## Bindings @@ -273,19 +274,30 @@ target_include_directories(xgboost #-- End shared library #-- CLI for xgboost -add_executable(runxgboost ${xgboost_SOURCE_DIR}/src/cli_main.cc) -target_link_libraries(runxgboost PRIVATE objxgboost) -target_include_directories(runxgboost - PRIVATE - ${xgboost_SOURCE_DIR}/include - ${xgboost_SOURCE_DIR}/dmlc-core/include - ${xgboost_SOURCE_DIR}/rabit/include -) -set_target_properties(runxgboost PROPERTIES OUTPUT_NAME xgboost) +if (BUILD_DEPRECATED_CLI) + add_executable(runxgboost ${xgboost_SOURCE_DIR}/src/cli_main.cc) + target_link_libraries(runxgboost PRIVATE objxgboost) + target_include_directories(runxgboost + PRIVATE + ${xgboost_SOURCE_DIR}/include + ${xgboost_SOURCE_DIR}/dmlc-core/include + ${xgboost_SOURCE_DIR}/rabit/include + ) + set_target_properties(runxgboost PROPERTIES OUTPUT_NAME xgboost) + xgboost_target_properties(runxgboost) + xgboost_target_link_libraries(runxgboost) + xgboost_target_defs(runxgboost) + + if (KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR) + set_output_directory(runxgboost ${xgboost_BINARY_DIR}) + else () + set_output_directory(runxgboost ${xgboost_SOURCE_DIR}) + endif (KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR) +endif (BUILD_DEPRECATED_CLI) #-- End CLI for xgboost # Common setup for all targets -foreach(target xgboost objxgboost dmlc runxgboost) +foreach(target xgboost objxgboost dmlc) xgboost_target_properties(${target}) xgboost_target_link_libraries(${target}) xgboost_target_defs(${target}) @@ -298,14 +310,15 @@ if (JVM_BINDINGS) endif (JVM_BINDINGS) if (KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR) - set_output_directory(runxgboost ${xgboost_BINARY_DIR}) set_output_directory(xgboost ${xgboost_BINARY_DIR}/lib) else () - set_output_directory(runxgboost ${xgboost_SOURCE_DIR}) set_output_directory(xgboost ${xgboost_SOURCE_DIR}/lib) endif () + # Ensure these two targets do not build simultaneously, as they produce outputs with conflicting names -add_dependencies(xgboost runxgboost) +if (BUILD_DEPRECATED_CLI) + add_dependencies(xgboost runxgboost) +endif (BUILD_DEPRECATED_CLI) #-- Installing XGBoost if (R_LIB) @@ -341,9 +354,17 @@ install(DIRECTORY ${xgboost_SOURCE_DIR}/include/xgboost # # https://github.com/dmlc/xgboost/issues/6085 if (BUILD_STATIC_LIB) - set(INSTALL_TARGETS xgboost runxgboost objxgboost dmlc) + if (BUILD_DEPRECATED_CLI) + set(INSTALL_TARGETS xgboost runxgboost objxgboost dmlc) + else() + set(INSTALL_TARGETS xgboost objxgboost dmlc) + endif (BUILD_DEPRECATED_CLI) else (BUILD_STATIC_LIB) - set(INSTALL_TARGETS xgboost runxgboost) + if (BUILD_DEPRECATED_CLI) + set(INSTALL_TARGETS xgboost runxgboost) + else(BUILD_DEPRECATED_CLI) + set(INSTALL_TARGETS xgboost) + endif (BUILD_DEPRECATED_CLI) endif (BUILD_STATIC_LIB) install(TARGETS ${INSTALL_TARGETS} @@ -393,13 +414,15 @@ if (GOOGLE_TEST) ${xgboost_SOURCE_DIR}/tests/cli/machine.conf.in ${xgboost_BINARY_DIR}/tests/cli/machine.conf @ONLY) - add_test( - NAME TestXGBoostCLI - COMMAND runxgboost ${xgboost_BINARY_DIR}/tests/cli/machine.conf - WORKING_DIRECTORY ${xgboost_BINARY_DIR}) - set_tests_properties(TestXGBoostCLI - PROPERTIES - PASS_REGULAR_EXPRESSION ".*test-rmse:0.087.*") + if (BUILD_DEPRECATED_CLI) + add_test( + NAME TestXGBoostCLI + COMMAND runxgboost ${xgboost_BINARY_DIR}/tests/cli/machine.conf + WORKING_DIRECTORY ${xgboost_BINARY_DIR}) + 
set_tests_properties(TestXGBoostCLI + PROPERTIES + PASS_REGULAR_EXPRESSION ".*test-rmse:0.087.*") + endif (BUILD_DEPRECATED_CLI) endif (GOOGLE_TEST) # For MSVC: Call msvc_use_static_runtime() once again to completely diff --git a/demo/CLI/README.rst b/demo/CLI/README.rst new file mode 100644 index 000000000..e828cd8ae --- /dev/null +++ b/demo/CLI/README.rst @@ -0,0 +1,4 @@ +XGBoost Command Line Interface Walkthrough +========================================== + +Please note that the command line interface is deprecated in 2.1.0, use other language bindings instead. For a list of available bindings, see https://xgboost.readthedocs.io/en/stable/ diff --git a/src/cli_main.cc b/src/cli_main.cc index 9507e6e89..8f088a516 100644 --- a/src/cli_main.cc +++ b/src/cli_main.cc @@ -514,7 +514,9 @@ class CLI { }; } // namespace xgboost -int main(int argc, char *argv[]) { +int main(int argc, char* argv[]) { + LOG(WARNING) + << "The command line interface is deprecated and will be removed in future releases."; try { xgboost::CLI cli(argc, argv); return cli.Run(); diff --git a/tests/buildkite/build-win64-gpu.ps1 b/tests/buildkite/build-win64-gpu.ps1 index 32cd2806a..092d4b192 100644 --- a/tests/buildkite/build-win64-gpu.ps1 +++ b/tests/buildkite/build-win64-gpu.ps1 @@ -13,7 +13,7 @@ if ( $is_release_branch -eq 0 ) { mkdir build cd build cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON -DCMAKE_VERBOSE_MAKEFILE=ON ` - -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON ${arch_flag} + -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON ${arch_flag} $msbuild = -join @( "C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\MSBuild\\Current" "\\Bin\\MSBuild.exe" diff --git a/tests/ci_build/build_r_pkg_with_cuda.sh b/tests/ci_build/build_r_pkg_with_cuda.sh index 5d36a16e1..78a2afc1c 100755 --- a/tests/ci_build/build_r_pkg_with_cuda.sh +++ b/tests/ci_build/build_r_pkg_with_cuda.sh @@ -19,7 +19,6 @@ cmake .. -GNinja -DUSE_CUDA=ON -DR_LIB=ON ninja cd .. -rm xgboost # This super wacky hack is found in cmake/RPackageInstall.cmake.in and # cmake/RPackageInstallTargetSetup.cmake. This hack lets us bypass the normal build process of R # and have R use xgboost.so that we've already built. diff --git a/tests/ci_build/build_r_pkg_with_cuda_win64.sh b/tests/ci_build/build_r_pkg_with_cuda_win64.sh index ca67704b5..d44a418d1 100644 --- a/tests/ci_build/build_r_pkg_with_cuda_win64.sh +++ b/tests/ci_build/build_r_pkg_with_cuda_win64.sh @@ -22,7 +22,6 @@ cmake .. -G"Visual Studio 17 2022" -A x64 -DUSE_CUDA=ON -DR_LIB=ON -DLIBR_HOME=" cmake --build . --config Release --parallel cd .. -rm xgboost # This super wacky hack is found in cmake/RPackageInstall.cmake.in and # cmake/RPackageInstallTargetSetup.cmake. This hack lets us bypass the normal build process of R # and have R use xgboost.dll that we've already built. diff --git a/tests/ci_build/build_via_cmake.sh b/tests/ci_build/build_via_cmake.sh index ef5b8dc0e..d67a673b0 100755 --- a/tests/ci_build/build_via_cmake.sh +++ b/tests/ci_build/build_via_cmake.sh @@ -24,7 +24,7 @@ fi rm -rf build mkdir build cd build -cmake .. ${cmake_args} -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_ALL_WARNINGS=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -GNinja ${cmake_prefix_flag} -DHIDE_CXX_SYMBOLS=ON +cmake .. ${cmake_args} -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_ALL_WARNINGS=ON -DCMAKE_COMPILE_WARNING_AS_ERROR=ON -GNinja ${cmake_prefix_flag} -DHIDE_CXX_SYMBOLS=ON -DBUILD_DEPRECATED_CLI=ON ninja clean time ninja -v cd .. 
From d779a11af9979b222b15931709c41638a7cf5475 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 21 Aug 2023 10:27:35 +0800 Subject: [PATCH 113/136] Bump scala-collection-compat_2.12 from 2.10.0 to 2.11.0 in /jvm-packages (#9311) Bumps [scala-collection-compat_2.12](https://github.com/scala/scala-collection-compat) from 2.10.0 to 2.11.0. - [Release notes](https://github.com/scala/scala-collection-compat/releases) - [Commits](https://github.com/scala/scala-collection-compat/compare/v2.10.0...v2.11.0) --- updated-dependencies: - dependency-name: org.scala-lang.modules:scala-collection-compat_2.12 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index e93f9fef1..ba4bbdf72 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -47,7 +47,7 @@ 23.08.0 cuda11 3.2.16 - 2.10.0 + 2.11.0 From 044fea1281e3910a70368068a8d72ccbe6b406f0 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 21 Aug 2023 23:34:05 +0800 Subject: [PATCH 114/136] Drop support for loading remote files. (#9504) --- CMakeLists.txt | 4 -- doc/jvm/xgboost4j_spark_tutorial.rst | 33 --------------- python-package/xgboost/core.py | 6 +-- src/c_api/c_api.cc | 8 ++-- src/cli_main.cc | 8 ++-- src/common/io.cc | 48 ++++++---------------- src/common/io.h | 12 +++--- tests/cpp/c_api/test_c_api.cc | 4 +- tests/cpp/common/test_io.cc | 18 ++++---- tests/cpp/common/test_json.cc | 8 ++-- tests/cpp/data/test_sparse_page_dmatrix.cc | 4 +- tests/cpp/test_learner.cc | 2 +- 12 files changed, 43 insertions(+), 112 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 29043b594..27da42376 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,10 +72,6 @@ option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF) option(BUILD_WITH_SHARED_NCCL "Build with shared NCCL library." OFF) set(GPU_COMPUTE_VER "" CACHE STRING "Semicolon separated list of compute versions to be built against, e.g. '35;61'") -## Copied From dmlc -option(USE_HDFS "Build with HDFS support" OFF) -option(USE_AZURE "Build with AZURE support" OFF) -option(USE_S3 "Build with S3 support" OFF) ## Sanitizers option(USE_SANITIZER "Use santizer flags" OFF) option(SANITIZER_PATH "Path to sanitizes.") diff --git a/doc/jvm/xgboost4j_spark_tutorial.rst b/doc/jvm/xgboost4j_spark_tutorial.rst index 1cf3ba2c8..90859dfba 100644 --- a/doc/jvm/xgboost4j_spark_tutorial.rst +++ b/doc/jvm/xgboost4j_spark_tutorial.rst @@ -390,39 +390,6 @@ Then we can load this model with single node Python XGBoost: bst = xgb.Booster({'nthread': 4}) bst.load_model(nativeModelPath) -.. note:: Using HDFS and S3 for exporting the models with nativeBooster.saveModel() - - When interacting with other language bindings, XGBoost also supports saving-models-to and loading-models-from file systems other than the local one. You can use HDFS and S3 by prefixing the path with ``hdfs://`` and ``s3://`` respectively. However, for this capability, you must do **one** of the following: - - 1. Build XGBoost4J-Spark with the steps described in :ref:`here `, but turning `USE_HDFS `_ (or USE_S3, etc. in the same place) switch on. With this approach, you can reuse the above code example by replacing "nativeModelPath" with a HDFS path. 
- - - However, if you build with USE_HDFS, etc. you have to ensure that the involved shared object file, e.g. libhdfs.so, is put in the LIBRARY_PATH of your cluster. To avoid the complicated cluster environment configuration, choose the other option. - - 2. Use bindings of HDFS, S3, etc. to pass model files around. Here are the steps (taking HDFS as an example): - - - Create a new file with - - .. code-block:: scala - - val outputStream = fs.create("hdfs_path") - - where "fs" is an instance of `org.apache.hadoop.fs.FileSystem `_ class in Hadoop. - - - Pass the returned OutputStream in the first step to nativeBooster.saveModel(): - - .. code-block:: scala - - xgbClassificationModel.nativeBooster.saveModel(outputStream) - - - Download file in other languages from HDFS and load with the pre-built (without the requirement of libhdfs.so) version of XGBoost. (The function "download_from_hdfs" is a helper function to be implemented by the user) - - .. code-block:: python - - import xgboost as xgb - bst = xgb.Booster({'nthread': 4}) - local_path = download_from_hdfs("hdfs_path") - bst.load_model(local_path) - .. note:: Consistency issue between XGBoost4J-Spark and other bindings There is a consistency issue between XGBoost4J-Spark and other language bindings of XGBoost. diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 27d62cdf2..d59d2f1d1 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -505,8 +505,7 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes Parameters ---------- cache_prefix : - Prefix to the cache files, only used in external memory. It can be either an - URI or a file path. + Prefix to the cache files, only used in external memory. release_data : Whether the iterator should release the data during reset. Set it to True if the data transformation (converting data to np.float32 type) is expensive. @@ -2558,8 +2557,7 @@ class Booster: return ctypes2buffer(cptr, length.value) def load_model(self, fname: ModelIn) -> None: - """Load the model from a file or bytearray. Path to file can be local - or as an URI. + """Load the model from a file or a bytearray. The model is loaded from XGBoost format which is universal among the various XGBoost interfaces. 
Auxiliary attributes of the Python Booster object (such as diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 0c98c0198..5b49d136f 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -1220,12 +1220,12 @@ XGB_DLL int XGBoosterLoadModel(BoosterHandle handle, const char* fname) { return str; }; if (common::FileExtension(fname) == "json") { - auto str = read_file(); - Json in{Json::Load(StringView{str})}; + auto buffer = read_file(); + Json in{Json::Load(StringView{buffer.data(), buffer.size()})}; static_cast(handle)->LoadModel(in); } else if (common::FileExtension(fname) == "ubj") { - auto str = read_file(); - Json in = Json::Load(StringView{str}, std::ios::binary); + auto buffer = read_file(); + Json in = Json::Load(StringView{buffer.data(), buffer.size()}, std::ios::binary); static_cast(handle)->LoadModel(in); } else { std::unique_ptr fi(dmlc::Stream::Create(fname, "r")); diff --git a/src/cli_main.cc b/src/cli_main.cc index 8f088a516..276d67da8 100644 --- a/src/cli_main.cc +++ b/src/cli_main.cc @@ -345,10 +345,10 @@ class CLI { void LoadModel(std::string const& path, Learner* learner) const { if (common::FileExtension(path) == "json") { - auto str = common::LoadSequentialFile(path); - CHECK_GT(str.size(), 2); - CHECK_EQ(str[0], '{'); - Json in{Json::Load({str.c_str(), str.size()})}; + auto buffer = common::LoadSequentialFile(path); + CHECK_GT(buffer.size(), 2); + CHECK_EQ(buffer[0], '{'); + Json in{Json::Load({buffer.data(), buffer.size()})}; learner->LoadModel(in); } else { std::unique_ptr fi(dmlc::Stream::Create(path.c_str(), "r")); diff --git a/src/common/io.cc b/src/common/io.cc index 8dbeba935..1715669b0 100644 --- a/src/common/io.cc +++ b/src/common/io.cc @@ -139,7 +139,7 @@ auto SystemErrorMsg() { } } // anonymous namespace -std::string LoadSequentialFile(std::string uri, bool stream) { +std::vector LoadSequentialFile(std::string uri) { auto OpenErr = [&uri]() { std::string msg; msg = "Opening " + uri + " failed: "; @@ -148,44 +148,20 @@ std::string LoadSequentialFile(std::string uri, bool stream) { }; auto parsed = dmlc::io::URI(uri.c_str()); + CHECK((parsed.protocol == "file://" || parsed.protocol.length() == 0)) + << "Only local file is supported."; // Read from file. - if ((parsed.protocol == "file://" || parsed.protocol.length() == 0) && !stream) { - std::string buffer; - // Open in binary mode so that correct file size can be computed with - // seekg(). This accommodates Windows platform: - // https://docs.microsoft.com/en-us/cpp/standard-library/basic-istream-class?view=vs-2019#seekg - auto path = std::filesystem::weakly_canonical(std::filesystem::u8path(uri)); - std::ifstream ifs(path, std::ios_base::binary | std::ios_base::in); - if (!ifs) { - // https://stackoverflow.com/a/17338934 - OpenErr(); - } - - ifs.seekg(0, std::ios_base::end); - const size_t file_size = static_cast(ifs.tellg()); - ifs.seekg(0, std::ios_base::beg); - buffer.resize(file_size + 1); - ifs.read(&buffer[0], file_size); - buffer.back() = '\0'; - - return buffer; + auto path = std::filesystem::weakly_canonical(std::filesystem::u8path(uri)); + std::ifstream ifs(path, std::ios_base::binary | std::ios_base::in); + if (!ifs) { + // https://stackoverflow.com/a/17338934 + OpenErr(); } - // Read from remote. 
- std::unique_ptr fs{dmlc::Stream::Create(uri.c_str(), "r")}; - std::string buffer; - size_t constexpr kInitialSize = 4096; - size_t size {kInitialSize}, total {0}; - while (true) { - buffer.resize(total + size); - size_t read = fs->Read(&buffer[total], size); - total += read; - if (read < size) { - break; - } - size *= 2; - } - buffer.resize(total); + auto file_size = std::filesystem::file_size(path); + std::vector buffer(file_size); + ifs.read(&buffer[0], file_size); + return buffer; } diff --git a/src/common/io.h b/src/common/io.h index 95971abae..07bb60787 100644 --- a/src/common/io.h +++ b/src/common/io.h @@ -84,16 +84,14 @@ class FixedSizeStream : public PeekableInStream { std::string buffer_; }; -/*! - * \brief Helper function for loading consecutive file to avoid dmlc Stream when possible. +/** + * @brief Helper function for loading consecutive file. * - * \param uri URI or file name to file. - * \param stream Use dmlc Stream unconditionally if set to true. Used for running test - * without remote filesystem. + * @param uri URI or file name to file. * - * \return File content. + * @return File content. */ -std::string LoadSequentialFile(std::string uri, bool stream = false); +std::vector LoadSequentialFile(std::string uri); /** * \brief Get file extension from file name. diff --git a/tests/cpp/c_api/test_c_api.cc b/tests/cpp/c_api/test_c_api.cc index 205e5f561..3bf03c955 100644 --- a/tests/cpp/c_api/test_c_api.cc +++ b/tests/cpp/c_api/test_c_api.cc @@ -216,8 +216,8 @@ TEST(CAPI, JsonModelIO) { std::string buffer; Json::Dump(Json::Load(l, std::ios::binary), &buffer); - ASSERT_EQ(model_str_0.size() - 1, buffer.size()); - ASSERT_EQ(model_str_0.back(), '\0'); + ASSERT_EQ(model_str_0.size(), buffer.size()); + ASSERT_EQ(model_str_0.back(), '}'); ASSERT_TRUE(std::equal(model_str_0.begin(), model_str_0.end() - 1, buffer.begin())); ASSERT_EQ(XGBoosterSaveModelToBuffer(handle, R"({})", &len, &data), -1); diff --git a/tests/cpp/common/test_io.cc b/tests/cpp/common/test_io.cc index 8bc12698b..e4d65c1f4 100644 --- a/tests/cpp/common/test_io.cc +++ b/tests/cpp/common/test_io.cc @@ -63,31 +63,27 @@ TEST(IO, LoadSequentialFile) { // Generate a JSON file. 
size_t constexpr kRows = 1000, kCols = 100; - std::shared_ptr p_dmat{ - RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true)}; - std::unique_ptr learner { Learner::Create({p_dmat}) }; + std::shared_ptr p_dmat{RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true)}; + std::unique_ptr learner{Learner::Create({p_dmat})}; learner->SetParam("tree_method", "hist"); learner->Configure(); for (int32_t iter = 0; iter < 10; ++iter) { learner->UpdateOneIter(iter, p_dmat); } - Json out { Object() }; + Json out{Object()}; learner->SaveModel(&out); - std::string str; + std::vector str; Json::Dump(out, &str); std::string tmpfile = tempdir.path + "/model.json"; { - std::unique_ptr fo( - dmlc::Stream::Create(tmpfile.c_str(), "w")); - fo->Write(str.c_str(), str.size()); + std::unique_ptr fo(dmlc::Stream::Create(tmpfile.c_str(), "w")); + fo->Write(str.data(), str.size()); } - auto loaded = LoadSequentialFile(tmpfile, true); + auto loaded = LoadSequentialFile(tmpfile); ASSERT_EQ(loaded, str); - - ASSERT_THROW(LoadSequentialFile("non-exist", true), dmlc::Error); } TEST(IO, Resource) { diff --git a/tests/cpp/common/test_json.cc b/tests/cpp/common/test_json.cc index 1b4ed76ec..4d498ffd5 100644 --- a/tests/cpp/common/test_json.cc +++ b/tests/cpp/common/test_json.cc @@ -418,7 +418,7 @@ TEST(Json, AssigningString) { TEST(Json, LoadDump) { std::string ori_buffer = GetModelStr(); - Json origin {Json::Load(StringView{ori_buffer.c_str(), ori_buffer.size()})}; + Json origin{Json::Load(StringView{ori_buffer.c_str(), ori_buffer.size()})}; dmlc::TemporaryDirectory tempdir; auto const& path = tempdir.path + "test_model_dump"; @@ -430,9 +430,9 @@ TEST(Json, LoadDump) { ASSERT_TRUE(fout); fout << out << std::flush; - std::string new_buffer = common::LoadSequentialFile(path); + std::vector new_buffer = common::LoadSequentialFile(path); - Json load_back {Json::Load(StringView(new_buffer.c_str(), new_buffer.size()))}; + Json load_back{Json::Load(StringView(new_buffer.data(), new_buffer.size()))}; ASSERT_EQ(load_back, origin); } @@ -651,7 +651,7 @@ TEST(UBJson, Basic) { } auto data = common::LoadSequentialFile("test.ubj"); - UBJReader reader{StringView{data}}; + UBJReader reader{StringView{data.data(), data.size()}}; json = reader.Load(); return json; }; diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cc b/tests/cpp/data/test_sparse_page_dmatrix.cc index 839ea762e..efd323e77 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cc +++ b/tests/cpp/data/test_sparse_page_dmatrix.cc @@ -250,7 +250,7 @@ auto TestSparsePageDMatrixDeterminism(int32_t threads) { auto cache_name = data::MakeId(filename, dynamic_cast(sparse.get())) + ".row.page"; - std::string cache = common::LoadSequentialFile(cache_name); + auto cache = common::LoadSequentialFile(cache_name); return cache; } @@ -258,7 +258,7 @@ TEST(SparsePageDMatrix, Determinism) { #if defined(_MSC_VER) return; #endif // defined(_MSC_VER) - std::vector caches; + std::vector> caches; for (size_t i = 1; i < 18; i += 2) { caches.emplace_back(TestSparsePageDMatrixDeterminism(i)); } diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc index 48fd2d8e9..5a31ce1bd 100644 --- a/tests/cpp/test_learner.cc +++ b/tests/cpp/test_learner.cc @@ -184,7 +184,7 @@ TEST(Learner, JsonModelIO) { fout.close(); auto loaded_str = common::LoadSequentialFile(tmpdir.path + "/model.json"); - Json loaded = Json::Load(StringView{loaded_str.c_str(), loaded_str.size()}); + Json loaded = Json::Load(StringView{loaded_str.data(), loaded_str.size()}); learner->LoadModel(loaded); 
learner->Configure(); From 302bbdc958ca134238a3ee6ffbc52c817b9c8041 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 22 Aug 2023 13:46:35 +0800 Subject: [PATCH 115/136] mitigate flaky test with distributed l1 error. (#9499) --- .../test_with_dask/test_with_dask.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py index 664c0b89c..ae8d24139 100644 --- a/tests/test_distributed/test_with_dask/test_with_dask.py +++ b/tests/test_distributed/test_with_dask/test_with_dask.py @@ -18,7 +18,7 @@ import numpy as np import pytest import scipy import sklearn -from hypothesis import HealthCheck, given, note, settings +from hypothesis import HealthCheck, assume, given, note, settings from sklearn.datasets import make_classification, make_regression import xgboost as xgb @@ -1462,10 +1462,9 @@ class TestWithDask: params["tree_method"] = tree_method params["debug_synchronize"] = True params = dataset.set_params(params) - # It doesn't make sense to distribute a completely - # empty dataset. - if dataset.X.shape[0] == 0: - return + + # It doesn't make sense to distribute a completely empty dataset. + assume(dataset.X.shape[0] != 0) chunk = 128 y_chunk = chunk if len(dataset.y.shape) == 1 else (chunk, dataset.y.shape[1]) @@ -1498,8 +1497,8 @@ class TestWithDask: # See note on `ObjFunction::UpdateTreeLeaf`. update_leaf = dataset.name.endswith("-l1") - if update_leaf and len(history) >= 2: - assert history[0] >= history[-1] + if update_leaf and (is_stump() or minimum_bin()): + assert tm.non_increasing(history, tolerance=1e-2) return elif minimum_bin() and is_stump(): assert tm.non_increasing(history, tolerance=1e-3) @@ -1508,7 +1507,7 @@ class TestWithDask: # Make sure that it's decreasing if is_stump(): # we might have already got the best score with base_score. - assert history[-1] <= history[0] + assert history[-1] <= history[0] + 1e-3 else: assert history[-1] < history[0] From 3c09399f2992ebb73ac63b548cd6e469690600dc Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 23 Aug 2023 00:17:35 +0800 Subject: [PATCH 116/136] Fix device dispatch for linear updater. (#9507) --- doc/parameter.rst | 2 +- include/xgboost/global_config.h | 22 +++++++-------- src/gbm/gblinear.cc | 22 ++++++++++----- src/linear/updater_coordinate.cc | 11 ++++---- src/linear/updater_gpu_coordinate.cu | 16 +++++------ tests/cpp/gbm/test_gblinear.cu | 42 ++++++++++++++++++++++++++++ 6 files changed, 80 insertions(+), 35 deletions(-) create mode 100644 tests/cpp/gbm/test_gblinear.cu diff --git a/doc/parameter.rst b/doc/parameter.rst index 6e4756ee8..1162d6f1f 100644 --- a/doc/parameter.rst +++ b/doc/parameter.rst @@ -329,7 +329,7 @@ Parameters for Linear Booster (``booster=gblinear``) - Choice of algorithm to fit linear model - ``shotgun``: Parallel coordinate descent algorithm based on shotgun algorithm. Uses 'hogwild' parallelism and therefore produces a nondeterministic solution on each run. - - ``coord_descent``: Ordinary coordinate descent algorithm. Also multithreaded but still produces a deterministic solution. + - ``coord_descent``: Ordinary coordinate descent algorithm. Also multithreaded but still produces a deterministic solution. When the ``device`` parameter is set to ``cuda`` or ``gpu``, a GPU variant would be used. 
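As a quick illustration of the new dispatch, a hedged sketch (it assumes a CUDA-enabled build; the ``device`` parameter is available from XGBoost 2.0 onwards):

.. code-block:: python

    import numpy as np
    import xgboost as xgb

    rng = np.random.default_rng(0)
    X = rng.normal(size=(256, 8))
    y = X @ np.arange(8)

    # With device="cuda", the coord_descent updater is swapped for its GPU
    # variant internally; no explicit "gpu_coord_descent" updater is needed.
    booster = xgb.train(
        {"booster": "gblinear", "updater": "coord_descent", "device": "cuda"},
        xgb.DMatrix(X, label=y),
        num_boost_round=10,
    )

The warning-based deprecation of the standalone ``gpu_coord_descent`` updater (see ``gblinear.cc`` below) keeps old configurations working while steering users to the ``device`` parameter.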
* ``feature_selector`` [default= ``cyclic``] diff --git a/include/xgboost/global_config.h b/include/xgboost/global_config.h index 835d63c88..fb06e4516 100644 --- a/include/xgboost/global_config.h +++ b/include/xgboost/global_config.h @@ -1,5 +1,5 @@ -/*! - * Copyright 2020 by Contributors +/** + * Copyright 2020-2023, XGBoost Contributors * \file global_config.h * \brief Global configuration for XGBoost * \author Hyunsu Cho @@ -7,24 +7,22 @@ #ifndef XGBOOST_GLOBAL_CONFIG_H_ #define XGBOOST_GLOBAL_CONFIG_H_ -#include -#include -#include +#include // for ThreadLocalStore +#include // for XGBoostParameter + +#include // for int32_t namespace xgboost { -class Json; - struct GlobalConfiguration : public XGBoostParameter { - int verbosity { 1 }; - bool use_rmm { false }; + std::int32_t verbosity{1}; + bool use_rmm{false}; DMLC_DECLARE_PARAMETER(GlobalConfiguration) { DMLC_DECLARE_FIELD(verbosity) .set_range(0, 3) .set_default(1) // shows only warning .describe("Flag to print out detailed breakdown of runtime."); - DMLC_DECLARE_FIELD(use_rmm) - .set_default(false) - .describe("Whether to use RAPIDS Memory Manager to allocate GPU memory in XGBoost"); + DMLC_DECLARE_FIELD(use_rmm).set_default(false).describe( + "Whether to use RAPIDS Memory Manager to allocate GPU memory in XGBoost"); } }; diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc index 64e9603de..520f76581 100644 --- a/src/gbm/gblinear.cc +++ b/src/gbm/gblinear.cc @@ -1,5 +1,5 @@ -/*! - * Copyright 2014-2022 by XGBoost Contributors +/** + * Copyright 2014-2023, XGBoost Contributors * \file gblinear.cc * \brief Implementation of Linear booster, with L1/L2 regularization: Elastic Net * the update rule is parallel coordinate descent (shotgun) @@ -26,9 +26,9 @@ #include "../common/timer.h" #include "../common/common.h" #include "../common/threading_utils.h" +#include "../common/error_msg.h" -namespace xgboost { -namespace gbm { +namespace xgboost::gbm { DMLC_REGISTRY_FILE_TAG(gblinear); @@ -83,7 +83,16 @@ class GBLinear : public GradientBooster { } param_.UpdateAllowUnknown(cfg); param_.CheckGPUSupport(); - updater_.reset(LinearUpdater::Create(param_.updater, ctx_)); + if (param_.updater == "gpu_coord_descent") { + LOG(WARNING) << error::DeprecatedFunc("gpu_coord_descent", "2.0.0", + R"(device="cuda", updater="coord_descent")"); + } + + if (param_.updater == "coord_descent" && ctx_->IsCUDA()) { + updater_.reset(LinearUpdater::Create("gpu_coord_descent", ctx_)); + } else { + updater_.reset(LinearUpdater::Create(param_.updater, ctx_)); + } updater_->Configure(cfg); monitor_.Init("GBLinear"); } @@ -354,5 +363,4 @@ XGBOOST_REGISTER_GBM(GBLinear, "gblinear") .set_body([](LearnerModelParam const* booster_config, Context const* ctx) { return new GBLinear(booster_config, ctx); }); -} // namespace gbm -} // namespace xgboost +} // namespace xgboost::gbm diff --git a/src/linear/updater_coordinate.cc b/src/linear/updater_coordinate.cc index 84f15d706..f660a1be8 100644 --- a/src/linear/updater_coordinate.cc +++ b/src/linear/updater_coordinate.cc @@ -9,8 +9,7 @@ #include "coordinate_common.h" #include "xgboost/json.h" -namespace xgboost { -namespace linear { +namespace xgboost::linear { DMLC_REGISTER_PARAMETER(CoordinateParam); DMLC_REGISTRY_FILE_TAG(updater_coordinate); @@ -39,8 +38,9 @@ class CoordinateUpdater : public LinearUpdater { FromJson(config.at("linear_train_param"), &tparam_); FromJson(config.at("coordinate_param"), &cparam_); } - void SaveConfig(Json* p_out) const override { - auto& out = *p_out; + void SaveConfig(Json *p_out) const 
override { + LOG(DEBUG) << "Save config for CPU updater."; + auto &out = *p_out; out["linear_train_param"] = ToJson(tparam_); out["coordinate_param"] = ToJson(cparam_); } @@ -99,5 +99,4 @@ class CoordinateUpdater : public LinearUpdater { XGBOOST_REGISTER_LINEAR_UPDATER(CoordinateUpdater, "coord_descent") .describe("Update linear model according to coordinate descent algorithm.") .set_body([]() { return new CoordinateUpdater(); }); -} // namespace linear -} // namespace xgboost +} // namespace xgboost::linear diff --git a/src/linear/updater_gpu_coordinate.cu b/src/linear/updater_gpu_coordinate.cu index 7d658cf78..b6c817696 100644 --- a/src/linear/updater_gpu_coordinate.cu +++ b/src/linear/updater_gpu_coordinate.cu @@ -15,8 +15,7 @@ #include "../common/timer.h" #include "./param.h" -namespace xgboost { -namespace linear { +namespace xgboost::linear { DMLC_REGISTRY_FILE_TAG(updater_gpu_coordinate); @@ -29,7 +28,7 @@ DMLC_REGISTRY_FILE_TAG(updater_gpu_coordinate); class GPUCoordinateUpdater : public LinearUpdater { // NOLINT public: // set training parameter - void Configure(Args const& args) override { + void Configure(Args const &args) override { tparam_.UpdateAllowUnknown(args); coord_param_.UpdateAllowUnknown(args); selector_.reset(FeatureSelector::Create(tparam_.feature_selector)); @@ -41,8 +40,9 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT FromJson(config.at("linear_train_param"), &tparam_); FromJson(config.at("coordinate_param"), &coord_param_); } - void SaveConfig(Json* p_out) const override { - auto& out = *p_out; + void SaveConfig(Json *p_out) const override { + LOG(DEBUG) << "Save config for GPU updater."; + auto &out = *p_out; out["linear_train_param"] = ToJson(tparam_); out["coordinate_param"] = ToJson(coord_param_); } @@ -101,10 +101,9 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT monitor_.Stop("LazyInitDevice"); monitor_.Start("UpdateGpair"); - auto &in_gpair_host = in_gpair->ConstHostVector(); // Update gpair if (ctx_->gpu_id >= 0) { - this->UpdateGpair(in_gpair_host); + this->UpdateGpair(in_gpair->ConstHostVector()); } monitor_.Stop("UpdateGpair"); @@ -249,5 +248,4 @@ XGBOOST_REGISTER_LINEAR_UPDATER(GPUCoordinateUpdater, "gpu_coord_descent") "Update linear model according to coordinate descent algorithm. 
GPU " "accelerated.") .set_body([]() { return new GPUCoordinateUpdater(); }); -} // namespace linear -} // namespace xgboost +} // namespace xgboost::linear diff --git a/tests/cpp/gbm/test_gblinear.cu b/tests/cpp/gbm/test_gblinear.cu new file mode 100644 index 000000000..b158fb32b --- /dev/null +++ b/tests/cpp/gbm/test_gblinear.cu @@ -0,0 +1,42 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include +#include // for GlobalConfigThreadLocalStore +#include // for Json, Object +#include // for Learner + +#include // for transform +#include // for string +#include // for swap + +#include "../helpers.h" // for RandomDataGenerator + +namespace xgboost { +TEST(GBlinear, DispatchUpdater) { + auto verbosity = 3; + std::swap(GlobalConfigThreadLocalStore::Get()->verbosity, verbosity); + + auto test = [](std::string device) { + auto p_fmat = RandomDataGenerator{10, 10, 0.0f}.GenerateDMatrix(true); + std::unique_ptr learner{Learner::Create({p_fmat})}; + learner->SetParams( + Args{{"booster", "gblinear"}, {"updater", "coord_descent"}, {"device", device}}); + learner->Configure(); + for (std::int32_t iter = 0; iter < 3; ++iter) { + learner->UpdateOneIter(iter, p_fmat); + } + Json config{Object{}}; + ::testing::internal::CaptureStderr(); + learner->SaveConfig(&config); + auto str = ::testing::internal::GetCapturedStderr(); + std::transform(device.cbegin(), device.cend(), device.begin(), + [](char c) { return std::toupper(c); }); + ASSERT_NE(str.find(device), std::string::npos); + }; + test("cpu"); + test("gpu"); + + std::swap(GlobalConfigThreadLocalStore::Get()->verbosity, verbosity); +} +} // namespace xgboost From 8c10af45a0886f2e400114dc4dbf4f9ffb4334f1 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 23 Aug 2023 01:53:40 +0800 Subject: [PATCH 117/136] Delay the check for vector leaf. 
(#9509)
---
 src/gbm/gbtree.cc | 15 +++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index e3df38629..92154609c 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -146,14 +146,6 @@ void GBTree::Configure(Args const& cfg) {
   if (specified_updater_) {
     error::WarnManualUpdater();
   }
-
-  if (model_.learner_model_param->IsVectorLeaf()) {
-    CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
-        << "Only the hist tree method is supported for building multi-target trees with vector "
-           "leaf.";
-    CHECK(ctx_->IsCPU()) << "GPU is not yet supported for vector leaf.";
-  }
-
   LOG(DEBUG) << "Using tree method: " << static_cast<int>(tparam_.tree_method);
 
   if (!specified_updater_) {
@@ -225,6 +217,13 @@ void GBTree::UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector<float> const
 
 void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
                      PredictionCacheEntry* predt, ObjFunction const* obj) {
+  if (model_.learner_model_param->IsVectorLeaf()) {
+    CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto)
+        << "Only the hist tree method is supported for building multi-target trees with vector "
+           "leaf.";
+    CHECK(ctx_->IsCPU()) << "GPU is not yet supported for vector leaf.";
+  }
+
   TreesOneIter new_trees;
   bst_target_t const n_groups = model_.learner_model_param->OutputLength();
   monitor_.Start("BoostNewTrees");

From 6103dca0bbc96b635eee8c7a69dbf2e25404faa0 Mon Sep 17 00:00:00 2001
From: Rong Ou
Date: Wed, 23 Aug 2023 01:33:43 -0700
Subject: [PATCH 118/136] Support column split in GPU evaluate splits (#9511)

---
 src/collective/communicator-inl.cuh           |  14 ++
 src/collective/device_communicator.cuh        |  11 +
 .../device_communicator_adapter.cuh           |  18 +-
 src/collective/nccl_device_communicator.cu    |  11 +
 src/collective/nccl_device_communicator.cuh   |   1 +
 src/tree/gpu_hist/evaluate_splits.cu          |  19 +-
 src/tree/gpu_hist/evaluate_splits.cuh         |   6 +-
 src/tree/gpu_hist/evaluator.cu                |   9 +-
 src/tree/updater_gpu_hist.cu                  |   3 +-
 tests/cpp/plugin/test_federated_adapter.cu    |  24 +-
 .../cpp/tree/gpu_hist/test_evaluate_splits.cu | 237 ++++++++++--------
 11 files changed, 240 insertions(+), 113 deletions(-)

diff --git a/src/collective/communicator-inl.cuh b/src/collective/communicator-inl.cuh
index 0c5fcf910..200a9ff4a 100644
--- a/src/collective/communicator-inl.cuh
+++ b/src/collective/communicator-inl.cuh
@@ -57,6 +57,20 @@ inline void AllReduce(int device, double *send_receive_buffer, size_t count) {
   Communicator::GetDevice(device)->AllReduce(send_receive_buffer, count, DataType::kDouble, op);
 }
 
+/**
+ * @brief Gather values from all processes.
+ *
+ * This assumes all ranks have the same size.
+ *
+ * @param send_buffer Buffer storing the data to be sent.
+ * @param receive_buffer Buffer storing the gathered data.
+ * @param send_size Size of the sent data in bytes.
+ */
+inline void AllGather(int device, void const *send_buffer, void *receive_buffer,
+                      std::size_t send_size) {
+  Communicator::GetDevice(device)->AllGather(send_buffer, receive_buffer, send_size);
+}
+
 /**
  * @brief Gather variable-length values from all processes.
  * @param device ID of the device.
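The fixed-size contract is what separates ``AllGather`` from ``AllGatherV``. Conceptually, in an illustrative sketch that is not part of the XGBoost API:

.. code-block:: python

    def allgather(send_buffers):
        """Fixed-size all-gather: every rank contributes the same number of
        bytes, and every rank receives the rank-ordered concatenation."""
        assert len({len(b) for b in send_buffers}) == 1, "ranks must agree on size"
        gathered = b"".join(send_buffers)  # rank 0 first, then rank 1, ...
        return [gathered] * len(send_buffers)

``AllGatherV`` lifts the equal-size requirement by exchanging per-rank segment lengths first.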
diff --git a/src/collective/device_communicator.cuh b/src/collective/device_communicator.cuh index a59891863..69094b382 100644 --- a/src/collective/device_communicator.cuh +++ b/src/collective/device_communicator.cuh @@ -27,6 +27,17 @@ class DeviceCommunicator { virtual void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type, Operation op) = 0; + /** + * @brief Gather values from all all processes. + * + * This assumes all ranks have the same size. + * + * @param send_buffer Buffer storing the data to be sent. + * @param receive_buffer Buffer storing the gathered data. + * @param send_size Size of the sent data in bytes. + */ + virtual void AllGather(void const *send_buffer, void *receive_buffer, std::size_t send_size) = 0; + /** * @brief Gather variable-length values from all processes. * @param send_buffer Buffer storing the input data. diff --git a/src/collective/device_communicator_adapter.cuh b/src/collective/device_communicator_adapter.cuh index f8135fb94..d10b10486 100644 --- a/src/collective/device_communicator_adapter.cuh +++ b/src/collective/device_communicator_adapter.cuh @@ -28,12 +28,26 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator { dh::safe_cuda(cudaSetDevice(device_ordinal_)); auto size = count * GetTypeSize(data_type); - host_buffer_.reserve(size); + host_buffer_.resize(size); dh::safe_cuda(cudaMemcpy(host_buffer_.data(), send_receive_buffer, size, cudaMemcpyDefault)); Allreduce(host_buffer_.data(), count, data_type, op); dh::safe_cuda(cudaMemcpy(send_receive_buffer, host_buffer_.data(), size, cudaMemcpyDefault)); } + void AllGather(void const *send_buffer, void *receive_buffer, std::size_t send_size) override { + if (world_size_ == 1) { + return; + } + + dh::safe_cuda(cudaSetDevice(device_ordinal_)); + host_buffer_.resize(send_size * world_size_); + dh::safe_cuda(cudaMemcpy(host_buffer_.data() + rank_ * send_size, send_buffer, send_size, + cudaMemcpyDefault)); + Allgather(host_buffer_.data(), host_buffer_.size()); + dh::safe_cuda( + cudaMemcpy(receive_buffer, host_buffer_.data(), host_buffer_.size(), cudaMemcpyDefault)); + } + void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector *segments, dh::caching_device_vector *receive_buffer) override { if (world_size_ == 1) { @@ -49,7 +63,7 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator { auto total_bytes = std::accumulate(segments->cbegin(), segments->cend(), 0UL); receive_buffer->resize(total_bytes); - host_buffer_.reserve(total_bytes); + host_buffer_.resize(total_bytes); size_t offset = 0; for (int32_t i = 0; i < world_size_; ++i) { size_t as_bytes = segments->at(i); diff --git a/src/collective/nccl_device_communicator.cu b/src/collective/nccl_device_communicator.cu index 51fa5693c..3d4905cb1 100644 --- a/src/collective/nccl_device_communicator.cu +++ b/src/collective/nccl_device_communicator.cu @@ -178,6 +178,17 @@ void NcclDeviceCommunicator::AllReduce(void *send_receive_buffer, std::size_t co allreduce_calls_ += 1; } +void NcclDeviceCommunicator::AllGather(void const *send_buffer, void *receive_buffer, + std::size_t send_size) { + if (world_size_ == 1) { + return; + } + + dh::safe_cuda(cudaSetDevice(device_ordinal_)); + dh::safe_nccl(ncclAllGather(send_buffer, receive_buffer, send_size, ncclInt8, nccl_comm_, + dh::DefaultStream())); +} + void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_bytes, std::vector *segments, dh::caching_device_vector *receive_buffer) { diff --git a/src/collective/nccl_device_communicator.cuh 
b/src/collective/nccl_device_communicator.cuh index d99002685..084db2046 100644 --- a/src/collective/nccl_device_communicator.cuh +++ b/src/collective/nccl_device_communicator.cuh @@ -29,6 +29,7 @@ class NcclDeviceCommunicator : public DeviceCommunicator { ~NcclDeviceCommunicator() override; void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type, Operation op) override; + void AllGather(void const *send_buffer, void *receive_buffer, std::size_t send_size) override; void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector *segments, dh::caching_device_vector *receive_buffer) override; void Synchronize() override; diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index c48c8ddf3..30941c060 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -5,8 +5,8 @@ #include #include +#include "../../collective/communicator-inl.cuh" #include "../../common/categorical.h" -#include "../../common/device_helpers.cuh" #include "../../data/ellpack_page.cuh" #include "evaluate_splits.cuh" #include "expand_entry.cuh" @@ -409,6 +409,23 @@ void GPUHistEvaluator::EvaluateSplits( this->LaunchEvaluateSplits(max_active_features, d_inputs, shared_inputs, evaluator, out_splits); + if (is_column_split_) { + // With column-wise data split, we gather the split candidates from all the workers and find the + // global best candidates. + auto const world_size = collective::GetWorldSize(); + dh::TemporaryArray all_candidate_storage(out_splits.size() * world_size); + auto all_candidates = dh::ToSpan(all_candidate_storage); + collective::AllGather(device_, out_splits.data(), all_candidates.data(), + out_splits.size() * sizeof(DeviceSplitCandidate)); + + // Reduce to get the best candidate from all workers. + dh::LaunchN(out_splits.size(), [world_size, all_candidates, out_splits] __device__(size_t i) { + for (auto rank = 0; rank < world_size; rank++) { + out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i]; + } + }); + } + auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()); auto d_entries = out_entries; auto device_cats_accessor = this->DeviceCatStorage(nidx); diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 0b44f31aa..25a8cde89 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -83,6 +83,9 @@ class GPUHistEvaluator { // Number of elements of categorical storage type // needed to hold categoricals for a single mode std::size_t node_categorical_storage_size_ = 0; + // Is the data split column-wise? + bool is_column_split_ = false; + int32_t device_; // Copy the categories from device to host asynchronously. void CopyToHost( const std::vector& nidx); @@ -136,7 +139,8 @@ class GPUHistEvaluator { * \brief Reset the evaluator, should be called before any use. */ void Reset(common::HistogramCuts const &cuts, common::Span ft, - bst_feature_t n_features, TrainParam const ¶m, int32_t device); + bst_feature_t n_features, TrainParam const ¶m, bool is_column_split, + int32_t device); /** * \brief Get host category storage for nidx. 
Different from the internal version, this diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu index bd1891aa4..69485aa81 100644 --- a/src/tree/gpu_hist/evaluator.cu +++ b/src/tree/gpu_hist/evaluator.cu @@ -14,10 +14,9 @@ namespace xgboost { namespace tree { -void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, - common::Span ft, - bst_feature_t n_features, TrainParam const ¶m, - int32_t device) { +void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span ft, + bst_feature_t n_features, TrainParam const ¶m, + bool is_column_split, int32_t device) { param_ = param; tree_evaluator_ = TreeEvaluator{param, n_features, device}; has_categoricals_ = cuts.HasCategorical(); @@ -65,6 +64,8 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, return fidx; }); } + is_column_split_ = is_column_split; + device_ = device; } common::Span GPUHistEvaluator::SortHistogram( diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 5cce89e2c..9e94c46c6 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -242,7 +242,8 @@ struct GPUHistMakerDevice { page = sample.page; gpair = sample.gpair; - this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, ctx_->gpu_id); + this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, + dmat->Info().IsColumnSplit(), ctx_->gpu_id); quantiser.reset(new GradientQuantiser(this->gpair)); diff --git a/tests/cpp/plugin/test_federated_adapter.cu b/tests/cpp/plugin/test_federated_adapter.cu index 8aa5304ea..cec180e70 100644 --- a/tests/cpp/plugin/test_federated_adapter.cu +++ b/tests/cpp/plugin/test_federated_adapter.cu @@ -11,8 +11,8 @@ #include "../../../plugin/federated/federated_communicator.h" #include "../../../src/collective/communicator-inl.cuh" #include "../../../src/collective/device_communicator_adapter.cuh" -#include "./helpers.h" #include "../helpers.h" +#include "./helpers.h" namespace xgboost::collective { @@ -45,6 +45,28 @@ TEST_F(FederatedAdapterTest, MGPUAllReduceSum) { RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyAllReduceSum); } +namespace { +void VerifyAllGather() { + auto const world_size = collective::GetWorldSize(); + auto const rank = collective::GetRank(); + auto const device = GPUIDX; + common::SetDevice(device); + thrust::device_vector send_buffer(1, rank); + thrust::device_vector receive_buffer(world_size, 0); + collective::AllGather(device, send_buffer.data().get(), receive_buffer.data().get(), + sizeof(double)); + thrust::host_vector host_buffer = receive_buffer; + EXPECT_EQ(host_buffer.size(), world_size); + for (auto i = 0; i < world_size; i++) { + EXPECT_EQ(host_buffer[i], i); + } +} +} // anonymous namespace + +TEST_F(FederatedAdapterTest, MGPUAllGather) { + RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyAllGather); +} + namespace { void VerifyAllGatherV() { auto const world_size = collective::GetWorldSize(); diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index cb2f7d604..f74b7d3ca 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -2,24 +2,23 @@ * Copyright 2020-2022 by XGBoost contributors */ #include +#include #include "../../../../src/tree/gpu_hist/evaluate_splits.cuh" #include "../../helpers.h" #include "../../histogram_helpers.h" #include "../test_evaluate_splits.h" // TestPartitionBasedSplit -#include 
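Setting the mechanical test churn aside for a moment: the gather-then-reduce step added in ``evaluate_splits.cu`` above has simple semantics. Below is a hedged pseudocode sketch; the ``Candidate`` type and the use of ``loss_chg`` as the comparison key are assumptions standing in for ``DeviceSplitCandidate`` and its "keep the best candidate" reduction:

.. code-block:: python

    from dataclasses import dataclass

    @dataclass
    class Candidate:  # stand-in for DeviceSplitCandidate
        loss_chg: float
        findex: int

    def reduce_best_splits(all_candidates, world_size, n_nodes):
        # all_candidates holds world_size * n_nodes entries grouped by rank,
        # mirroring the layout written by the all-gather.
        best = list(all_candidates[:n_nodes])  # rank 0's proposals
        for rank in range(1, world_size):
            for nidx in range(n_nodes):
                cand = all_candidates[rank * n_nodes + nidx]
                if cand.loss_chg > best[nidx].loss_chg:  # assumed comparison key
                    best[nidx] = cand
        return best

Because every worker runs the same reduction over the same gathered candidates, all workers agree on the global best split without a further broadcast.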
namespace xgboost { namespace tree { + namespace { auto ZeroParam() { - auto args = Args{{"min_child_weight", "0"}, - {"lambda", "0"}}; + auto args = Args{{"min_child_weight", "0"}, {"lambda", "0"}}; TrainParam tparam; tparam.UpdateAllowUnknown(args); return tparam; } - } // anonymous namespace inline GradientQuantiser DummyRoundingFactor() { @@ -37,7 +36,6 @@ thrust::device_vector ConvertToInteger(std::vector feature_set = std::vector{0}; GPUTrainingParam param{param_}; @@ -61,12 +59,13 @@ TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) { GPUHistEvaluator evaluator{param_, static_cast(feature_set.size()), 0}; - evaluator.Reset(cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, 0); + evaluator.Reset(cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, false, 0); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; ASSERT_EQ(result.thresh, 1); this->CheckResult(result.loss_chg, result.findex, result.fvalue, result.is_cat, - result.dir == kLeftDir, quantiser.ToFloatingPoint(result.left_sum), quantiser.ToFloatingPoint(result.right_sum)); + result.dir == kLeftDir, quantiser.ToFloatingPoint(result.left_sum), + quantiser.ToFloatingPoint(result.right_sum)); } TEST(GpuHist, PartitionBasic) { @@ -102,7 +101,7 @@ TEST(GpuHist, PartitionBasic) { }; GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0}; - evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0); + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0); { // -1.0s go right @@ -143,7 +142,8 @@ TEST(GpuHist, PartitionBasic) { EXPECT_EQ(result.left_sum + result.right_sum, parent_sum); } // With 3.0/3.0 missing values - // Forward, first 2 categories are selected, while the last one go to left along with missing value + // Forward, first 2 categories are selected, while the last one go to left along with missing + // value { auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 6.0}); auto feature_histogram = ConvertToInteger({{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}}); @@ -213,11 +213,12 @@ TEST(GpuHist, PartitionTwoFeatures) { false}; GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0}; - evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0); + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0); { auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0}); - auto feature_histogram = ConvertToInteger({ {-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}}); + auto feature_histogram = ConvertToInteger( + {{-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}}); EvaluateSplitInputs input{0, 0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram)}; DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; @@ -229,7 +230,8 @@ TEST(GpuHist, PartitionTwoFeatures) { { auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0}); - auto feature_histogram = ConvertToInteger({ {-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}}); + auto feature_histogram = ConvertToInteger( + {{-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}}); EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram)}; DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; @@ -271,12 +273,12 @@ TEST(GpuHist, 
PartitionTwoNodes) { false}; GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0}; - evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0); + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0); { auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0}); - auto feature_histogram_a = ConvertToInteger({{-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}, - {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}}); + auto feature_histogram_a = ConvertToInteger( + {{-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}}); thrust::device_vector inputs(2); inputs[0] = EvaluateSplitInputs{0, 0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram_a)}; @@ -304,8 +306,7 @@ void TestEvaluateSingleSplit(bool is_categorical) { // Setup gradients so that second feature gets higher gain auto feature_histogram = ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}}); - dh::device_vector feature_types(feature_set.size(), - FeatureType::kCategorical); + dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical); common::Span d_feature_types; if (is_categorical) { auto max_cat = *std::max_element(cuts.cut_values_.HostVector().begin(), @@ -324,9 +325,8 @@ void TestEvaluateSingleSplit(bool is_categorical) { cuts.min_vals_.ConstDeviceSpan(), false}; - GPUHistEvaluator evaluator{ - tparam, static_cast(feature_set.size()), 0}; - evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0); + GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0}; + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; EXPECT_EQ(result.findex, 1); @@ -338,31 +338,23 @@ void TestEvaluateSingleSplit(bool is_categorical) { EXPECT_EQ(result.left_sum + result.right_sum, parent_sum); } -TEST(GpuHist, EvaluateSingleSplit) { - TestEvaluateSingleSplit(false); -} +TEST(GpuHist, EvaluateSingleSplit) { TestEvaluateSingleSplit(false); } -TEST(GpuHist, EvaluateSingleCategoricalSplit) { - TestEvaluateSingleSplit(true); -} +TEST(GpuHist, EvaluateSingleCategoricalSplit) { TestEvaluateSingleSplit(true); } TEST(GpuHist, EvaluateSingleSplitMissing) { auto quantiser = DummyRoundingFactor(); - auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{1.0, 1.5}); + auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{1.0, 1.5}); TrainParam tparam = ZeroParam(); GPUTrainingParam param{tparam}; - thrust::device_vector feature_set = - std::vector{0}; - thrust::device_vector feature_segments = - std::vector{0, 2}; + thrust::device_vector feature_set = std::vector{0}; + thrust::device_vector feature_segments = std::vector{0, 2}; thrust::device_vector feature_values = std::vector{1.0, 2.0}; thrust::device_vector feature_min_values = std::vector{0.0}; auto feature_histogram = ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}}); - EvaluateSplitInputs input{1,0, - parent_sum, - dh::ToSpan(feature_set), - dh::ToSpan(feature_histogram)}; + EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; EvaluateSplitSharedInputs shared_inputs{param, quantiser, {}, @@ -377,7 +369,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) { EXPECT_EQ(result.findex, 0); EXPECT_EQ(result.fvalue, 1.0); EXPECT_EQ(result.dir, kRightDir); - EXPECT_EQ(result.left_sum,quantiser.ToFixedPoint(GradientPairPrecise(-0.5, 0.5))); + EXPECT_EQ(result.left_sum, 
quantiser.ToFixedPoint(GradientPairPrecise(-0.5, 0.5))); EXPECT_EQ(result.right_sum, quantiser.ToFixedPoint(GradientPairPrecise(1.5, 1.0))); } @@ -398,24 +390,18 @@ TEST(GpuHist, EvaluateSingleSplitEmpty) { // Feature 0 has a better split, but the algorithm must select feature 1 TEST(GpuHist, EvaluateSingleSplitFeatureSampling) { auto quantiser = DummyRoundingFactor(); - auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0}); + auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0}); TrainParam tparam = ZeroParam(); tparam.UpdateAllowUnknown(Args{}); GPUTrainingParam param{tparam}; - thrust::device_vector feature_set = - std::vector{1}; - thrust::device_vector feature_segments = - std::vector{0, 2, 4}; - thrust::device_vector feature_values = - std::vector{1.0, 2.0, 11.0, 12.0}; - thrust::device_vector feature_min_values = - std::vector{0.0, 10.0}; - auto feature_histogram = ConvertToInteger({ {-10.0, 0.5}, {10.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}}); - EvaluateSplitInputs input{1,0, - parent_sum, - dh::ToSpan(feature_set), - dh::ToSpan(feature_histogram)}; + thrust::device_vector feature_set = std::vector{1}; + thrust::device_vector feature_segments = std::vector{0, 2, 4}; + thrust::device_vector feature_values = std::vector{1.0, 2.0, 11.0, 12.0}; + thrust::device_vector feature_min_values = std::vector{0.0, 10.0}; + auto feature_histogram = ConvertToInteger({{-10.0, 0.5}, {10.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}}); + EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; EvaluateSplitSharedInputs shared_inputs{param, quantiser, {}, @@ -429,31 +415,25 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) { EXPECT_EQ(result.findex, 1); EXPECT_EQ(result.fvalue, 11.0); - EXPECT_EQ(result.left_sum,quantiser.ToFixedPoint(GradientPairPrecise(-0.5, 0.5))); + EXPECT_EQ(result.left_sum, quantiser.ToFixedPoint(GradientPairPrecise(-0.5, 0.5))); EXPECT_EQ(result.right_sum, quantiser.ToFixedPoint(GradientPairPrecise(0.5, 0.5))); } // Features 0 and 1 have identical gain, the algorithm must select 0 TEST(GpuHist, EvaluateSingleSplitBreakTies) { auto quantiser = DummyRoundingFactor(); - auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0}); + auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0}); TrainParam tparam = ZeroParam(); tparam.UpdateAllowUnknown(Args{}); GPUTrainingParam param{tparam}; - thrust::device_vector feature_set = - std::vector{0, 1}; - thrust::device_vector feature_segments = - std::vector{0, 2, 4}; - thrust::device_vector feature_values = - std::vector{1.0, 2.0, 11.0, 12.0}; - thrust::device_vector feature_min_values = - std::vector{0.0, 10.0}; - auto feature_histogram = ConvertToInteger({ {-0.5, 0.5}, {0.5, 0.5}, {-0.5, 0.5}, {0.5, 0.5}}); - EvaluateSplitInputs input{1,0, - parent_sum, - dh::ToSpan(feature_set), - dh::ToSpan(feature_histogram)}; + thrust::device_vector feature_set = std::vector{0, 1}; + thrust::device_vector feature_segments = std::vector{0, 2, 4}; + thrust::device_vector feature_values = std::vector{1.0, 2.0, 11.0, 12.0}; + thrust::device_vector feature_min_values = std::vector{0.0, 10.0}; + auto feature_histogram = ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}, {-0.5, 0.5}, {0.5, 0.5}}); + EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; EvaluateSplitSharedInputs shared_inputs{param, quantiser, {}, @@ -463,7 +443,7 @@ TEST(GpuHist, EvaluateSingleSplitBreakTies) { false}; GPUHistEvaluator evaluator(tparam, 
feature_min_values.size(), 0); - DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input,shared_inputs).split; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; EXPECT_EQ(result.findex, 0); EXPECT_EQ(result.fvalue, 1.0); @@ -477,41 +457,31 @@ TEST(GpuHist, EvaluateSplits) { tparam.UpdateAllowUnknown(Args{}); GPUTrainingParam param{tparam}; - thrust::device_vector feature_set = - std::vector{0, 1}; - thrust::device_vector feature_segments = - std::vector{0, 2, 4}; - thrust::device_vector feature_values = - std::vector{1.0, 2.0, 11.0, 12.0}; - thrust::device_vector feature_min_values = - std::vector{0.0, 0.0}; - auto feature_histogram_left = ConvertToInteger({ {-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}}); - auto feature_histogram_right = ConvertToInteger({ {-1.0, 0.5}, {1.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}}); - EvaluateSplitInputs input_left{ - 1,0, - parent_sum, - dh::ToSpan(feature_set), - dh::ToSpan(feature_histogram_left)}; - EvaluateSplitInputs input_right{ - 2,0, - parent_sum, - dh::ToSpan(feature_set), - dh::ToSpan(feature_histogram_right)}; - EvaluateSplitSharedInputs shared_inputs{ - param, - quantiser, - {}, - dh::ToSpan(feature_segments), - dh::ToSpan(feature_values), - dh::ToSpan(feature_min_values), - false - }; + thrust::device_vector feature_set = std::vector{0, 1}; + thrust::device_vector feature_segments = std::vector{0, 2, 4}; + thrust::device_vector feature_values = std::vector{1.0, 2.0, 11.0, 12.0}; + thrust::device_vector feature_min_values = std::vector{0.0, 0.0}; + auto feature_histogram_left = + ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}}); + auto feature_histogram_right = + ConvertToInteger({{-1.0, 0.5}, {1.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}}); + EvaluateSplitInputs input_left{1, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram_left)}; + EvaluateSplitInputs input_right{2, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram_right)}; + EvaluateSplitSharedInputs shared_inputs{param, + quantiser, + {}, + dh::ToSpan(feature_segments), + dh::ToSpan(feature_values), + dh::ToSpan(feature_min_values), + false}; - GPUHistEvaluator evaluator{ - tparam, static_cast(feature_min_values.size()), 0}; - dh::device_vector inputs = std::vector{input_left,input_right}; - evaluator.LaunchEvaluateSplits(input_left.feature_set.size(),dh::ToSpan(inputs),shared_inputs, evaluator.GetEvaluator(), - dh::ToSpan(out_splits)); + GPUHistEvaluator evaluator{tparam, static_cast(feature_min_values.size()), 0}; + dh::device_vector inputs = + std::vector{input_left, input_right}; + evaluator.LaunchEvaluateSplits(input_left.feature_set.size(), dh::ToSpan(inputs), shared_inputs, + evaluator.GetEvaluator(), dh::ToSpan(out_splits)); DeviceSplitCandidate result_left = out_splits[0]; EXPECT_EQ(result_left.findex, 1); @@ -530,18 +500,19 @@ TEST_F(TestPartitionBasedSplit, GpuHist) { cuts_.cut_values_.SetDevice(0); cuts_.min_vals_.SetDevice(0); - evaluator.Reset(cuts_, dh::ToSpan(ft), info_.num_col_, param_, 0); + evaluator.Reset(cuts_, dh::ToSpan(ft), info_.num_col_, param_, false, 0); // Convert the sample histogram to fixed point auto quantiser = DummyRoundingFactor(); thrust::host_vector h_hist; - for(auto e: hist_[0]){ + for (auto e : hist_[0]) { h_hist.push_back(quantiser.ToFixedPoint(e)); } dh::device_vector d_hist = h_hist; dh::device_vector feature_set{std::vector{0}}; - EvaluateSplitInputs input{0, 0, quantiser.ToFixedPoint(total_gpair_), dh::ToSpan(feature_set), dh::ToSpan(d_hist)}; + 
EvaluateSplitInputs input{0, 0, quantiser.ToFixedPoint(total_gpair_), dh::ToSpan(feature_set), + dh::ToSpan(d_hist)}; EvaluateSplitSharedInputs shared_inputs{GPUTrainingParam{param_}, quantiser, dh::ToSpan(ft), @@ -552,5 +523,65 @@ TEST_F(TestPartitionBasedSplit, GpuHist) { auto split = evaluator.EvaluateSingleSplit(input, shared_inputs).split; ASSERT_NEAR(split.loss_chg, best_score_, 1e-2); } + +class MGPUHistTest : public BaseMGPUTest {}; + +namespace { +void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) { + auto rank = collective::GetRank(); + auto quantiser = DummyRoundingFactor(); + auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0}); + TrainParam tparam = ZeroParam(); + GPUTrainingParam param{tparam}; + + common::HistogramCuts cuts{rank == 0 + ? MakeCutsForTest({1.0, 2.0}, {0, 2, 2}, {0.0, 0.0}, GPUIDX) + : MakeCutsForTest({11.0, 12.0}, {0, 0, 2}, {0.0, 0.0}, GPUIDX)}; + thrust::device_vector feature_set = std::vector{0, 1}; + + // Setup gradients so that second feature gets higher gain + auto feature_histogram = rank == 0 ? ConvertToInteger({{-0.5, 0.5}, {0.5, 0.5}}) + : ConvertToInteger({{-1.0, 0.5}, {1.0, 0.5}}); + + dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical); + common::Span d_feature_types; + if (is_categorical) { + auto max_cat = *std::max_element(cuts.cut_values_.HostVector().begin(), + cuts.cut_values_.HostVector().end()); + cuts.SetCategorical(true, max_cat); + d_feature_types = dh::ToSpan(feature_types); + } + + EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; + EvaluateSplitSharedInputs shared_inputs{param, + quantiser, + d_feature_types, + cuts.cut_ptrs_.ConstDeviceSpan(), + cuts.cut_values_.ConstDeviceSpan(), + cuts.min_vals_.ConstDeviceSpan(), + false}; + + GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), GPUIDX}; + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true, GPUIDX); + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + + EXPECT_EQ(result.findex, 1) << "rank: " << rank; + if (is_categorical) { + ASSERT_TRUE(std::isnan(result.fvalue)); + } else { + EXPECT_EQ(result.fvalue, 11.0) << "rank: " << rank; + } + EXPECT_EQ(result.left_sum + result.right_sum, parent_sum) << "rank: " << rank; +} +} // anonymous namespace + +TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleSplit) { + DoTest(VerifyColumnSplitEvaluateSingleSplit, false); +} + +TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleCategoricalSplit) { + DoTest(VerifyColumnSplitEvaluateSingleSplit, true); +} } // namespace tree } // namespace xgboost From 972730cde05569e00e8e0576aea8dcd546f0f1dd Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 24 Aug 2023 05:29:52 +0800 Subject: [PATCH 119/136] Use matrix for gradient. (#9508) - Use the `linalg::Matrix` for storing gradients. - New API for the custom objective. - Custom objective for multi-class/multi-target is now required to return the correct shape. - Custom objective for Python can accept arrays with any strides. 
(row-major, column-major) --- R-package/R/utils.R | 9 +- R-package/src/init.c | 4 +- R-package/src/xgboost_R.cc | 43 ++-- R-package/src/xgboost_R.h | 3 +- demo/guide-python/custom_softmax.py | 4 +- demo/guide-python/multioutput_regression.py | 7 +- include/xgboost/base.h | 4 +- include/xgboost/c_api.h | 43 ++-- include/xgboost/gbm.h | 34 ++-- include/xgboost/learner.h | 21 +- include/xgboost/linalg.h | 48 ++++- include/xgboost/linear_updater.h | 5 +- include/xgboost/objective.h | 31 ++- include/xgboost/tree_updater.h | 2 +- .../java/ml/dmlc/xgboost4j/java/Booster.java | 26 ++- .../ml/dmlc/xgboost4j/java/XGBoostJNI.java | 2 +- .../ml/dmlc/xgboost4j/scala/Booster.scala | 22 ++- .../xgboost4j/src/native/xgboost4j.cpp | 47 +++-- jvm-packages/xgboost4j/src/native/xgboost4j.h | 18 +- plugin/example/custom_obj.cc | 41 ++-- python-package/xgboost/core.py | 62 ++++-- python-package/xgboost/testing/__init__.py | 34 +++- python-package/xgboost/training.py | 2 +- src/c_api/c_api.cc | 89 ++++++--- src/c_api/c_api.cu | 28 ++- src/c_api/c_api_error.h | 8 +- src/c_api/c_api_utils.h | 55 +++++- src/data/array_interface.h | 4 +- src/data/data.cc | 2 +- src/gbm/gblinear.cc | 10 +- src/gbm/gbtree.cc | 35 ++-- src/gbm/gbtree.cu | 30 +-- src/gbm/gbtree.h | 6 +- src/learner.cc | 19 +- src/linear/updater_coordinate.cc | 15 +- src/linear/updater_gpu_coordinate.cu | 17 +- src/linear/updater_shotgun.cc | 25 ++- src/objective/aft_obj.cu | 19 +- src/objective/hinge.cu | 9 +- src/objective/init_estimation.cc | 2 +- src/objective/lambdarank_obj.cc | 58 +++--- src/objective/lambdarank_obj.cu | 31 +-- src/objective/lambdarank_obj.cuh | 6 +- src/objective/lambdarank_obj.h | 6 +- src/objective/multiclass_obj.cu | 17 +- src/objective/quantile_obj.cu | 30 +-- src/objective/regression_obj.cu | 185 ++++++++++-------- src/tree/fit_stump.cc | 7 +- src/tree/fit_stump.h | 2 +- src/tree/updater_approx.cc | 7 +- src/tree/updater_colmaker.cc | 5 +- src/tree/updater_gpu_hist.cu | 12 +- src/tree/updater_prune.cc | 2 +- src/tree/updater_quantile_hist.cc | 5 +- src/tree/updater_refresh.cc | 9 +- src/tree/updater_sync.cc | 2 +- tests/cpp/c_api/test_c_api.cc | 4 +- tests/cpp/gbm/test_gbtree.cc | 7 +- tests/cpp/helpers.cc | 21 +- tests/cpp/helpers.h | 32 ++- tests/cpp/linear/test_linear.cc | 8 +- tests/cpp/linear/test_linear.cu | 7 +- tests/cpp/objective/test_aft_obj.cc | 19 +- tests/cpp/objective/test_lambdarank_obj.cc | 32 +-- tests/cpp/objective/test_lambdarank_obj.cu | 29 +-- tests/cpp/objective/test_regression_obj.cc | 22 +-- tests/cpp/predictor/test_cpu_predictor.cc | 7 +- tests/cpp/test_multi_target.cc | 36 +++- tests/cpp/tree/test_fit_stump.cc | 18 +- tests/cpp/tree/test_gpu_hist.cu | 18 +- tests/cpp/tree/test_histmaker.cc | 26 ++- tests/cpp/tree/test_prediction_cache.cc | 2 +- tests/cpp/tree/test_prune.cc | 12 +- tests/cpp/tree/test_quantile_hist.cc | 18 +- tests/cpp/tree/test_refresh.cc | 8 +- tests/cpp/tree/test_tree_stat.cc | 17 +- tests/python-gpu/test_gpu_with_sklearn.py | 91 +++++++++ 77 files changed, 1052 insertions(+), 651 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 458b119f6..5faca2ef4 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -154,7 +154,14 @@ xgb.iter.update <- function(booster_handle, dtrain, iter, obj) { pred <- predict(booster_handle, dtrain, outputmargin = TRUE, training = TRUE, ntreelimit = 0) gpair <- obj(pred, dtrain) - .Call(XGBoosterBoostOneIter_R, booster_handle, dtrain, gpair$grad, gpair$hess) + n_samples <- dim(dtrain)[1] + # We still require row-major in R as 
I'm not quite sure sure how to get the stride of + # the matrix in C. + gpair$grad <- matrix(gpair$grad, nrow = n_samples, byrow = TRUE) + gpair$hess <- matrix(gpair$hess, nrow = n_samples, byrow = TRUE) + .Call( + XGBoosterBoostOneIter_R, booster_handle, dtrain, iter, gpair$grad, gpair$hess + ) } return(TRUE) } diff --git a/R-package/src/init.c b/R-package/src/init.c index 583dc7e32..09174222e 100644 --- a/R-package/src/init.c +++ b/R-package/src/init.c @@ -16,7 +16,7 @@ Check these declarations against the C/Fortran source code. */ /* .Call calls */ -extern SEXP XGBoosterBoostOneIter_R(SEXP, SEXP, SEXP, SEXP); +extern SEXP XGBoosterTrainOneIter_R(SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP XGBoosterCreate_R(SEXP); extern SEXP XGBoosterCreateInEmptyObj_R(SEXP, SEXP); extern SEXP XGBoosterDumpModel_R(SEXP, SEXP, SEXP, SEXP); @@ -53,7 +53,7 @@ extern SEXP XGBGetGlobalConfig_R(void); extern SEXP XGBoosterFeatureScore_R(SEXP, SEXP); static const R_CallMethodDef CallEntries[] = { - {"XGBoosterBoostOneIter_R", (DL_FUNC) &XGBoosterBoostOneIter_R, 4}, + {"XGBoosterBoostOneIter_R", (DL_FUNC) &XGBoosterTrainOneIter_R, 5}, {"XGBoosterCreate_R", (DL_FUNC) &XGBoosterCreate_R, 1}, {"XGBoosterCreateInEmptyObj_R", (DL_FUNC) &XGBoosterCreateInEmptyObj_R, 2}, {"XGBoosterDumpModel_R", (DL_FUNC) &XGBoosterDumpModel_R, 4}, diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index 805e63a32..b975ab8ba 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -48,13 +48,6 @@ using dmlc::BeginPtr; -xgboost::Context const *BoosterCtx(BoosterHandle handle) { - CHECK_HANDLE(); - auto *learner = static_cast(handle); - CHECK(learner); - return learner->Ctx(); -} - xgboost::Context const *DMatrixCtx(DMatrixHandle handle) { CHECK_HANDLE(); auto p_m = static_cast *>(handle); @@ -394,21 +387,25 @@ XGB_DLL SEXP XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) { return R_NilValue; } -XGB_DLL SEXP XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess) { +XGB_DLL SEXP XGBoosterTrainOneIter_R(SEXP handle, SEXP dtrain, SEXP iter, SEXP grad, SEXP hess) { R_API_BEGIN(); - CHECK_EQ(length(grad), length(hess)) - << "gradient and hess must have same length"; - int len = length(grad); - std::vector tgrad(len), thess(len); - auto ctx = BoosterCtx(R_ExternalPtrAddr(handle)); - xgboost::common::ParallelFor(len, ctx->Threads(), [&](xgboost::omp_ulong j) { - tgrad[j] = REAL(grad)[j]; - thess[j] = REAL(hess)[j]; - }); - CHECK_CALL(XGBoosterBoostOneIter(R_ExternalPtrAddr(handle), - R_ExternalPtrAddr(dtrain), - BeginPtr(tgrad), BeginPtr(thess), - len)); + CHECK_EQ(length(grad), length(hess)) << "gradient and hess must have same length"; + SEXP gdim = getAttrib(grad, R_DimSymbol); + auto n_samples = static_cast(INTEGER(gdim)[0]); + auto n_targets = static_cast(INTEGER(gdim)[1]); + + SEXP hdim = getAttrib(hess, R_DimSymbol); + CHECK_EQ(INTEGER(hdim)[0], n_samples) << "mismatched size between gradient and hessian"; + CHECK_EQ(INTEGER(hdim)[1], n_targets) << "mismatched size between gradient and hessian"; + double const *d_grad = REAL(grad); + double const *d_hess = REAL(hess); + + auto ctx = xgboost::detail::BoosterCtx(R_ExternalPtrAddr(handle)); + auto [s_grad, s_hess] = + xgboost::detail::MakeGradientInterface(ctx, d_grad, d_hess, n_samples, n_targets); + CHECK_CALL(XGBoosterTrainOneIter(R_ExternalPtrAddr(handle), R_ExternalPtrAddr(dtrain), + asInteger(iter), s_grad.c_str(), s_hess.c_str())); + R_API_END(); return R_NilValue; } @@ -460,7 +457,7 @@ XGB_DLL SEXP 
XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_con len *= out_shape[i]; } r_out_result = PROTECT(allocVector(REALSXP, len)); - auto ctx = BoosterCtx(R_ExternalPtrAddr(handle)); + auto ctx = xgboost::detail::BoosterCtx(R_ExternalPtrAddr(handle)); xgboost::common::ParallelFor(len, ctx->Threads(), [&](xgboost::omp_ulong i) { REAL(r_out_result)[i] = out_result[i]; }); @@ -669,7 +666,7 @@ XGB_DLL SEXP XGBoosterFeatureScore_R(SEXP handle, SEXP json_config) { } out_scores_sexp = PROTECT(allocVector(REALSXP, len)); - auto ctx = BoosterCtx(R_ExternalPtrAddr(handle)); + auto ctx = xgboost::detail::BoosterCtx(R_ExternalPtrAddr(handle)); xgboost::common::ParallelFor(len, ctx->Threads(), [&](xgboost::omp_ulong i) { REAL(out_scores_sexp)[i] = out_scores[i]; }); diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h index 45a43a5bd..7f0833b15 100644 --- a/R-package/src/xgboost_R.h +++ b/R-package/src/xgboost_R.h @@ -161,12 +161,13 @@ XGB_DLL SEXP XGBoosterUpdateOneIter_R(SEXP ext, SEXP iter, SEXP dtrain); * \brief update the model, by directly specify gradient and second order gradient, * this can be used to replace UpdateOneIter, to support customized loss function * \param handle handle + * \param iter The current training iteration. * \param dtrain training data * \param grad gradient statistics * \param hess second order gradient statistics * \return R_NilValue */ -XGB_DLL SEXP XGBoosterBoostOneIter_R(SEXP handle, SEXP dtrain, SEXP grad, SEXP hess); +XGB_DLL SEXP XGBoosterTrainOneIter_R(SEXP handle, SEXP dtrain, SEXP iter, SEXP grad, SEXP hess); /*! * \brief get evaluation statistics for xgboost diff --git a/demo/guide-python/custom_softmax.py b/demo/guide-python/custom_softmax.py index 153c5d43b..36265cf4d 100644 --- a/demo/guide-python/custom_softmax.py +++ b/demo/guide-python/custom_softmax.py @@ -76,9 +76,7 @@ def softprob_obj(predt: np.ndarray, data: xgb.DMatrix): grad[r, c] = g hess[r, c] = h - # Right now (XGBoost 1.0.0), reshaping is necessary - grad = grad.reshape((kRows * kClasses, 1)) - hess = hess.reshape((kRows * kClasses, 1)) + # After 2.1.0, pass the gradient as it is. return grad, hess diff --git a/demo/guide-python/multioutput_regression.py b/demo/guide-python/multioutput_regression.py index 7450fd30a..cc64e4e09 100644 --- a/demo/guide-python/multioutput_regression.py +++ b/demo/guide-python/multioutput_regression.py @@ -68,22 +68,21 @@ def rmse_model(plot_result: bool, strategy: str) -> None: def custom_rmse_model(plot_result: bool, strategy: str) -> None: """Train using Python implementation of Squared Error.""" - # As the experimental support status, custom objective doesn't support matrix as - # gradient and hessian, which will be changed in future release. def gradient(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray: """Compute the gradient squared error.""" y = dtrain.get_label().reshape(predt.shape) - return (predt - y).reshape(y.size) + return predt - y def hessian(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray: """Compute the hessian for squared error.""" - return np.ones(predt.shape).reshape(predt.size) + return np.ones(predt.shape) def squared_log( predt: np.ndarray, dtrain: xgb.DMatrix ) -> Tuple[np.ndarray, np.ndarray]: grad = gradient(predt, dtrain) hess = hessian(predt, dtrain) + # both numpy.ndarray and cupy.ndarray works. 
return grad, hess def rmse(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]: diff --git a/include/xgboost/base.h b/include/xgboost/base.h index f02d75cdc..dec306f0c 100644 --- a/include/xgboost/base.h +++ b/include/xgboost/base.h @@ -274,8 +274,8 @@ class GradientPairInt64 { GradientPairInt64(GradientPairInt64 const &g) = default; GradientPairInt64 &operator=(GradientPairInt64 const &g) = default; - XGBOOST_DEVICE [[nodiscard]] T GetQuantisedGrad() const { return grad_; } - XGBOOST_DEVICE [[nodiscard]] T GetQuantisedHess() const { return hess_; } + [[nodiscard]] XGBOOST_DEVICE T GetQuantisedGrad() const { return grad_; } + [[nodiscard]] XGBOOST_DEVICE T GetQuantisedHess() const { return hess_; } XGBOOST_DEVICE GradientPairInt64 &operator+=(const GradientPairInt64 &rhs) { grad_ += rhs.grad_; diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index fc60d2e77..afc1f47fd 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -789,16 +789,14 @@ XGB_DLL int XGDMatrixGetUIntInfo(const DMatrixHandle handle, * \param out The address to hold number of rows. * \return 0 when success, -1 when failure happens */ -XGB_DLL int XGDMatrixNumRow(DMatrixHandle handle, - bst_ulong *out); +XGB_DLL int XGDMatrixNumRow(DMatrixHandle handle, bst_ulong *out); /*! * \brief get number of columns * \param handle the handle to the DMatrix * \param out The output of number of columns * \return 0 when success, -1 when failure happens */ -XGB_DLL int XGDMatrixNumCol(DMatrixHandle handle, - bst_ulong *out); +XGB_DLL int XGDMatrixNumCol(DMatrixHandle handle, bst_ulong *out); /*! * \brief Get number of valid values from DMatrix. @@ -945,21 +943,30 @@ XGB_DLL int XGBoosterUpdateOneIter(BoosterHandle handle, int iter, DMatrixHandle * @example c-api-demo.c */ -/*! - * \brief update the model, by directly specify gradient and second order gradient, - * this can be used to replace UpdateOneIter, to support customized loss function - * \param handle handle - * \param dtrain training data - * \param grad gradient statistics - * \param hess second order gradient statistics - * \param len length of grad/hess array - * \return 0 when success, -1 when failure happens +/** + * @deprecated since 2.1.0 */ -XGB_DLL int XGBoosterBoostOneIter(BoosterHandle handle, - DMatrixHandle dtrain, - float *grad, - float *hess, - bst_ulong len); +XGB_DLL int XGBoosterBoostOneIter(BoosterHandle handle, DMatrixHandle dtrain, float *grad, + float *hess, bst_ulong len); + +/** + * @brief Update a model with gradient and Hessian. This is used for training with a + * custom objective function. + * + * @since 2.0.0 + * + * @param handle handle + * @param dtrain The training data. + * @param iter The current iteration round. When training continuation is used, the count + * should restart. + * @param grad Json encoded __(cuda)_array_interface__ for gradient. + * @param hess Json encoded __(cuda)_array_interface__ for Hessian. + * + * @return 0 when success, -1 when failure happens + */ +XGB_DLL int XGBoosterTrainOneIter(BoosterHandle handle, DMatrixHandle dtrain, int iter, + char const *grad, char const *hess); + /*! 
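For a concrete picture of the JSON payload that XGBoosterTrainOneIter consumes, here is a minimal Python sketch. The helper name encode_gradient and the toy shape are illustrative only; NumPy's standard __array_interface__ protocol supplies the fields, and the bindings essentially JSON-encode it verbatim:

    import json

    import numpy as np


    def encode_gradient(array: np.ndarray) -> bytes:
        # The C function reads the raw memory described by the interface,
        # so make the buffer dense and of a supported floating point type.
        array = np.ascontiguousarray(array, dtype=np.float32)
        # numpy already implements the protocol; JSON-encode it as-is.
        return json.dumps(dict(array.__array_interface__)).encode()


    grad = np.zeros((100, 3), dtype=np.float32)  # (n_samples, n_targets)
    print(encode_gradient(grad))
    # e.g. {"data": [139864559337472, false], "strides": null,
    #       "descr": [["", "<f4"]], "typestr": "<f4",
    #       "shape": [100, 3], "version": 3}

The CUDA variant, __cuda_array_interface__, carries essentially the same fields for device memory.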
* \brief get evaluation statistics for xgboost * \param handle handle diff --git a/include/xgboost/gbm.h b/include/xgboost/gbm.h index 6d3832093..ae8652eee 100644 --- a/include/xgboost/gbm.h +++ b/include/xgboost/gbm.h @@ -70,22 +70,25 @@ class GradientBooster : public Model, public Configurable { GradientBooster* /*out*/, bool* /*out_of_bound*/) const { LOG(FATAL) << "Slice is not supported by the current booster."; } - /*! \brief Return number of boosted rounds. + /** + * @brief Return number of boosted rounds. */ - virtual int32_t BoostedRounds() const = 0; + [[nodiscard]] virtual std::int32_t BoostedRounds() const = 0; /** * \brief Whether the model has already been trained. When tree booster is chosen, then * returns true when there are existing trees. */ - virtual bool ModelFitted() const = 0; - /*! - * \brief perform update to the model(boosting) - * \param p_fmat feature matrix that provide access to features - * \param in_gpair address of the gradient pair statistics of the data - * \param prediction The output prediction cache entry that needs to be updated. - * the booster may change content of gpair + [[nodiscard]] virtual bool ModelFitted() const = 0; + /** + * @brief perform update to the model(boosting) + * + * @param p_fmat feature matrix that provide access to features + * @param in_gpair address of the gradient pair statistics of the data + * @param prediction The output prediction cache entry that needs to be updated. + * the booster may change content of gpair + * @param obj The objective function used for boosting. */ - virtual void DoBoost(DMatrix* p_fmat, HostDeviceVector* in_gpair, + virtual void DoBoost(DMatrix* p_fmat, linalg::Matrix* in_gpair, PredictionCacheEntry*, ObjFunction const* obj) = 0; /** @@ -165,18 +168,17 @@ class GradientBooster : public Model, public Configurable { * \param format the format to dump the model in * \return a vector of dump for boosters. */ - virtual std::vector DumpModel(const FeatureMap& fmap, - bool with_stats, - std::string format) const = 0; + [[nodiscard]] virtual std::vector DumpModel(const FeatureMap& fmap, bool with_stats, + std::string format) const = 0; virtual void FeatureScore(std::string const& importance_type, common::Span trees, std::vector* features, std::vector* scores) const = 0; - /*! - * \brief Whether the current booster uses GPU. + /** + * @brief Whether the current booster uses GPU. */ - virtual bool UseGPU() const = 0; + [[nodiscard]] virtual bool UseGPU() const = 0; /*! * \brief create a gradient booster from given name * \param name name of gradient booster diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h index 8adb3cb27..cd081a2e8 100644 --- a/include/xgboost/learner.h +++ b/include/xgboost/learner.h @@ -76,17 +76,18 @@ class Learner : public Model, public Configurable, public dmlc::Serializable { * \param iter current iteration number * \param train reference to the data matrix. */ - virtual void UpdateOneIter(int iter, std::shared_ptr train) = 0; - /*! - * \brief Do customized gradient boosting with in_gpair. - * in_gair can be mutated after this call. - * \param iter current iteration number - * \param train reference to the data matrix. - * \param in_gpair The input gradient statistics. + virtual void UpdateOneIter(std::int32_t iter, std::shared_ptr train) = 0; + /** + * @brief Do customized gradient boosting with in_gpair. + * + * @note in_gpair can be mutated after this call. + * + * @param iter current iteration number + * @param train reference to the data matrix. 
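To make the new gradient layout concrete, a minimal end-to-end sketch in the spirit of the updated multioutput demo follows (the synthetic data and parameter choices are mine, not part of this patch). With DoBoost/BoostOneIter taking a matrix of gradient pairs, a Python objective simply returns (n_samples, n_targets) arrays, with no flattening step:

    import numpy as np
    import xgboost as xgb

    rng = np.random.default_rng(0)
    X = rng.normal(size=(512, 8)).astype(np.float32)
    y = np.stack([X[:, 0] ** 2, 3.0 * X[:, 1]], axis=1)  # two targets
    dtrain = xgb.DMatrix(X, label=y)


    def squared_error(predt: np.ndarray, dtrain: xgb.DMatrix):
        labels = dtrain.get_label().reshape(predt.shape)
        grad = predt - labels        # shape (n_samples, n_targets)
        hess = np.ones(predt.shape)  # same shape as the gradient
        return grad, hess


    booster = xgb.train(
        {"tree_method": "hist"}, dtrain, num_boost_round=8, obj=squared_error
    )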
+ * @param in_gpair The input gradient statistics. */ - virtual void BoostOneIter(int iter, - std::shared_ptr train, - HostDeviceVector* in_gpair) = 0; + virtual void BoostOneIter(std::int32_t iter, std::shared_ptr train, + linalg::Matrix* in_gpair) = 0; /*! * \brief evaluate the model for specific iteration using the configured metrics. * \param iter iteration number diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index 6d2b54f84..ae3489e3b 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -292,7 +292,7 @@ enum Order : std::uint8_t { template class TensorView { public: - using ShapeT = size_t[kDim]; + using ShapeT = std::size_t[kDim]; using StrideT = ShapeT; private: @@ -400,10 +400,14 @@ class TensorView { * \param shape shape of the tensor * \param device Device ordinal */ - template + template LINALG_HD TensorView(common::Span data, I const (&shape)[D], std::int32_t device) : TensorView{data, shape, device, Order::kC} {} + template + LINALG_HD TensorView(common::Span data, I const (&shape)[D], DeviceOrd device) + : TensorView{data, shape, device.ordinal, Order::kC} {} + template LINALG_HD TensorView(common::Span data, I const (&shape)[D], std::int32_t device, Order order) : data_{data}, ptr_{data_.data()}, device_{device} { @@ -446,6 +450,10 @@ class TensorView { }); this->CalcSize(); } + template + LINALG_HD TensorView(common::Span data, I const (&shape)[D], I const (&stride)[D], + DeviceOrd device) + : TensorView{data, shape, stride, device.ordinal} {} template < typename U, @@ -741,7 +749,7 @@ auto ArrayInterfaceStr(TensorView const &t) { template class Tensor { public: - using ShapeT = size_t[kDim]; + using ShapeT = std::size_t[kDim]; using StrideT = ShapeT; private: @@ -775,6 +783,9 @@ class Tensor { template explicit Tensor(I const (&shape)[D], std::int32_t device, Order order = kC) : Tensor{common::Span{shape}, device, order} {} + template + explicit Tensor(I const (&shape)[D], DeviceOrd device, Order order = kC) + : Tensor{common::Span{shape}, device.ordinal, order} {} template explicit Tensor(common::Span shape, std::int32_t device, Order order = kC) @@ -814,6 +825,10 @@ class Tensor { // shape this->Initialize(shape, device); } + template + explicit Tensor(std::initializer_list data, I const (&shape)[D], DeviceOrd device, + Order order = kC) + : Tensor{data, shape, device.ordinal, order} {} /** * \brief Index operator. Not thread safe, should not be used in performance critical * region. For more efficient indexing, consider getting a view first. @@ -832,9 +847,9 @@ class Tensor { } /** - * \brief Get a \ref TensorView for this tensor. + * @brief Get a @ref TensorView for this tensor. 
*/ - TensorView View(int32_t device) { + TensorView View(std::int32_t device) { if (device >= 0) { data_.SetDevice(device); auto span = data_.DeviceSpan(); @@ -844,7 +859,7 @@ class Tensor { return {span, shape_, device, order_}; } } - TensorView View(int32_t device) const { + TensorView View(std::int32_t device) const { if (device >= 0) { data_.SetDevice(device); auto span = data_.ConstDeviceSpan(); @@ -854,6 +869,26 @@ class Tensor { return {span, shape_, device, order_}; } } + auto View(DeviceOrd device) { + if (device.IsCUDA()) { + data_.SetDevice(device); + auto span = data_.DeviceSpan(); + return TensorView{span, shape_, device.ordinal, order_}; + } else { + auto span = data_.HostSpan(); + return TensorView{span, shape_, device.ordinal, order_}; + } + } + auto View(DeviceOrd device) const { + if (device.IsCUDA()) { + data_.SetDevice(device); + auto span = data_.ConstDeviceSpan(); + return TensorView{span, shape_, device.ordinal, order_}; + } else { + auto span = data_.ConstHostSpan(); + return TensorView{span, shape_, device.ordinal, order_}; + } + } auto HostView() const { return this->View(-1); } auto HostView() { return this->View(-1); } @@ -931,6 +966,7 @@ class Tensor { * \brief Set device ordinal for this tensor. */ void SetDevice(int32_t device) const { data_.SetDevice(device); } + void SetDevice(DeviceOrd device) const { data_.SetDevice(device); } [[nodiscard]] int32_t DeviceIdx() const { return data_.DeviceIdx(); } }; diff --git a/include/xgboost/linear_updater.h b/include/xgboost/linear_updater.h index 6faf11230..bcc8dd890 100644 --- a/include/xgboost/linear_updater.h +++ b/include/xgboost/linear_updater.h @@ -49,9 +49,8 @@ class LinearUpdater : public Configurable { * \param model Model to be updated. * \param sum_instance_weight The sum instance weights, used to normalise l1/l2 penalty. */ - virtual void Update(HostDeviceVector* in_gpair, DMatrix* data, - gbm::GBLinearModel* model, - double sum_instance_weight) = 0; + virtual void Update(linalg::Matrix* in_gpair, DMatrix* data, + gbm::GBLinearModel* model, double sum_instance_weight) = 0; /*! * \brief Create a linear updater given name diff --git a/include/xgboost/objective.h b/include/xgboost/objective.h index a04d2e453..d2623ee01 100644 --- a/include/xgboost/objective.h +++ b/include/xgboost/objective.h @@ -41,17 +41,16 @@ class ObjFunction : public Configurable { * \param args arguments to the objective function. */ virtual void Configure(const std::vector >& args) = 0; - /*! - * \brief Get gradient over each of predictions, given existing information. - * \param preds prediction of current round - * \param info information about labels, weights, groups in rank - * \param iteration current iteration number. - * \param out_gpair output of get gradient, saves gradient and second order gradient in + /** + * @brief Get gradient over each of predictions, given existing information. + * + * @param preds prediction of current round + * @param info information about labels, weights, groups in rank + * @param iteration current iteration number. + * @param out_gpair output of get gradient, saves gradient and second order gradient in */ - virtual void GetGradient(const HostDeviceVector& preds, - const MetaInfo& info, - int iteration, - HostDeviceVector* out_gpair) = 0; + virtual void GetGradient(const HostDeviceVector& preds, const MetaInfo& info, + std::int32_t iter, linalg::Matrix* out_gpair) = 0; /*! 
\return the default evaluation metric for the objective */ virtual const char* DefaultEvalMetric() const = 0; @@ -81,9 +80,7 @@ class ObjFunction : public Configurable { * used by gradient boosting * \return transformed value */ - virtual bst_float ProbToMargin(bst_float base_score) const { - return base_score; - } + [[nodiscard]] virtual bst_float ProbToMargin(bst_float base_score) const { return base_score; } /** * \brief Make initialize estimation of prediction. * @@ -94,14 +91,14 @@ class ObjFunction : public Configurable { /*! * \brief Return task of this objective. */ - virtual struct ObjInfo Task() const = 0; + [[nodiscard]] virtual struct ObjInfo Task() const = 0; /** - * \brief Return number of targets for input matrix. Right now XGBoost supports only + * @brief Return number of targets for input matrix. Right now XGBoost supports only * multi-target regression. */ - virtual bst_target_t Targets(MetaInfo const& info) const { + [[nodiscard]] virtual bst_target_t Targets(MetaInfo const& info) const { if (info.labels.Shape(1) > 1) { - LOG(FATAL) << "multioutput is not supported by current objective function"; + LOG(FATAL) << "multioutput is not supported by the current objective function"; } return 1; } diff --git a/include/xgboost/tree_updater.h b/include/xgboost/tree_updater.h index 79b80319f..477c8e4a1 100644 --- a/include/xgboost/tree_updater.h +++ b/include/xgboost/tree_updater.h @@ -71,7 +71,7 @@ class TreeUpdater : public Configurable { * but maybe different random seeds, usually one tree is passed in at a time, * there can be multiple trees when we train random forest style model */ - virtual void Update(tree::TrainParam const* param, HostDeviceVector* gpair, + virtual void Update(tree::TrainParam const* param, linalg::Matrix* gpair, DMatrix* data, common::Span> out_position, const std::vector& out_trees) = 0; diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java index 23b8b1a80..11f5299c0 100644 --- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java +++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java @@ -218,34 +218,48 @@ public class Booster implements Serializable, KryoSerializable { XGBoostJNI.checkCall(XGBoostJNI.XGBoosterUpdateOneIter(handle, iter, dtrain.getHandle())); } + @Deprecated + public void update(DMatrix dtrain, IObjective obj) throws XGBoostError { + float[][] predicts = this.predict(dtrain, true, 0, false, false); + List gradients = obj.getGradient(predicts, dtrain); + this.boost(dtrain, gradients.get(0), gradients.get(1)); + } + /** * Update with customize obj func * * @param dtrain training data + * @param iter The current training iteration. * @param obj customized objective class * @throws XGBoostError native error */ - public void update(DMatrix dtrain, IObjective obj) throws XGBoostError { + public void update(DMatrix dtrain, int iter, IObjective obj) throws XGBoostError { float[][] predicts = this.predict(dtrain, true, 0, false, false); List gradients = obj.getGradient(predicts, dtrain); - boost(dtrain, gradients.get(0), gradients.get(1)); + this.boost(dtrain, iter, gradients.get(0), gradients.get(1)); + } + + @Deprecated + public void boost(DMatrix dtrain, float[] grad, float[] hess) throws XGBoostError { + this.boost(dtrain, 0, grad, hess); } /** - * update with give grad and hess + * Update with give grad and hess * * @param dtrain training data + * @param iter The current training iteration. 
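The iteration-aware flow added to the Java and Scala wrappers here has a direct Python counterpart. A hedged sketch of a hand-rolled training loop (toy data; update() predicts with output_margin, applies the objective, then boosts):

    import numpy as np
    import xgboost as xgb

    rng = np.random.default_rng(0)
    X = rng.normal(size=(256, 4)).astype(np.float32)
    y = (X[:, 0] > 0).astype(np.float32)
    dtrain = xgb.DMatrix(X, label=y)


    def logistic(predt: np.ndarray, dtrain: xgb.DMatrix):
        p = 1.0 / (1.0 + np.exp(-predt))
        grad = (p - dtrain.get_label()).reshape(-1, 1)
        hess = (p * (1.0 - p)).reshape(-1, 1)
        return grad, hess


    booster = xgb.Booster({"tree_method": "hist"}, [dtrain])
    for i in range(4):
        # The iteration number is now threaded through to the native call.
        booster.update(dtrain, iteration=i, fobj=logistic)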
* @param grad first order of gradient * @param hess seconde order of gradient * @throws XGBoostError native error */ - public void boost(DMatrix dtrain, float[] grad, float[] hess) throws XGBoostError { + public void boost(DMatrix dtrain, int iter, float[] grad, float[] hess) throws XGBoostError { if (grad.length != hess.length) { throw new AssertionError(String.format("grad/hess length mismatch %s / %s", grad.length, hess.length)); } - XGBoostJNI.checkCall(XGBoostJNI.XGBoosterBoostOneIter(handle, - dtrain.getHandle(), grad, hess)); + XGBoostJNI.checkCall(XGBoostJNI.XGBoosterTrainOneIter(handle, + dtrain.getHandle(), iter, grad, hess)); } /** diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java index abe584f05..d71d0a4f5 100644 --- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java +++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java @@ -110,7 +110,7 @@ class XGBoostJNI { public final static native int XGBoosterUpdateOneIter(long handle, int iter, long dtrain); - public final static native int XGBoosterBoostOneIter(long handle, long dtrain, float[] grad, + public final static native int XGBoosterTrainOneIter(long handle, long dtrain, int iter, float[] grad, float[] hess); public final static native int XGBoosterEvalOneIter(long handle, int iter, long[] dmats, diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/Booster.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/Booster.scala index a1d122679..31be86898 100644 --- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/Booster.scala +++ b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/Booster.scala @@ -106,27 +106,41 @@ class Booster private[xgboost4j](private[xgboost4j] var booster: JBooster) booster.update(dtrain.jDMatrix, iter) } + @throws(classOf[XGBoostError]) + @deprecated + def update(dtrain: DMatrix, obj: ObjectiveTrait): Unit = { + booster.update(dtrain.jDMatrix, obj) + } + /** * update with customize obj func * * @param dtrain training data + * @param iter The current training iteration * @param obj customized objective class */ @throws(classOf[XGBoostError]) - def update(dtrain: DMatrix, obj: ObjectiveTrait): Unit = { - booster.update(dtrain.jDMatrix, obj) + def update(dtrain: DMatrix, iter: Int, obj: ObjectiveTrait): Unit = { + booster.update(dtrain.jDMatrix, iter, obj) + } + + @throws(classOf[XGBoostError]) + @deprecated + def boost(dtrain: DMatrix, grad: Array[Float], hess: Array[Float]): Unit = { + booster.boost(dtrain.jDMatrix, grad, hess) } /** * update with give grad and hess * * @param dtrain training data + * @param iter The current training iteration * @param grad first order of gradient * @param hess seconde order of gradient */ @throws(classOf[XGBoostError]) - def boost(dtrain: DMatrix, grad: Array[Float], hess: Array[Float]): Unit = { - booster.boost(dtrain.jDMatrix, grad, hess) + def boost(dtrain: DMatrix, iter: Int, grad: Array[Float], hess: Array[Float]): Unit = { + booster.boost(dtrain.jDMatrix, iter, grad, hess) } /** diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp index a61a68dbc..60c2f126c 100644 --- a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp +++ b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp @@ -28,6 +28,7 @@ #include #include +#include "../../../src/c_api/c_api_error.h" #include 
"../../../src/c_api/c_api_utils.h" #define JVM_CHECK_CALL(__expr) \ @@ -579,22 +580,44 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterUpdateOne /* * Class: ml_dmlc_xgboost4j_java_XGBoostJNI - * Method: XGBoosterBoostOneIter - * Signature: (JJ[F[F)V + * Method: XGBoosterTrainOneIter + * Signature: (JJI[F[F)I */ -JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterBoostOneIter - (JNIEnv *jenv, jclass jcls, jlong jhandle, jlong jdtrain, jfloatArray jgrad, jfloatArray jhess) { - BoosterHandle handle = (BoosterHandle) jhandle; - DMatrixHandle dtrain = (DMatrixHandle) jdtrain; - jfloat* grad = jenv->GetFloatArrayElements(jgrad, 0); - jfloat* hess = jenv->GetFloatArrayElements(jhess, 0); - bst_ulong len = (bst_ulong)jenv->GetArrayLength(jgrad); - int ret = XGBoosterBoostOneIter(handle, dtrain, grad, hess, len); - JVM_CHECK_CALL(ret); - //release +JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterTrainOneIter( + JNIEnv *jenv, jclass jcls, jlong jhandle, jlong jdtrain, jint jiter, jfloatArray jgrad, + jfloatArray jhess) { + API_BEGIN(); + BoosterHandle handle = reinterpret_cast(jhandle); + DMatrixHandle dtrain = reinterpret_cast(jdtrain); + CHECK(handle); + CHECK(dtrain); + bst_ulong n_samples{0}; + JVM_CHECK_CALL(XGDMatrixNumRow(dtrain, &n_samples)); + + bst_ulong len = static_cast(jenv->GetArrayLength(jgrad)); + jfloat *grad = jenv->GetFloatArrayElements(jgrad, nullptr); + jfloat *hess = jenv->GetFloatArrayElements(jhess, nullptr); + CHECK(grad); + CHECK(hess); + + xgboost::bst_target_t n_targets{1}; + if (len != n_samples && n_samples != 0) { + CHECK_EQ(len % n_samples, 0) << "Invalid size of gradient."; + n_targets = len / n_samples; + } + + auto ctx = xgboost::detail::BoosterCtx(handle); + auto [s_grad, s_hess] = + xgboost::detail::MakeGradientInterface(ctx, grad, hess, n_samples, n_targets); + int ret = XGBoosterTrainOneIter(handle, dtrain, static_cast(jiter), s_grad.c_str(), + s_hess.c_str()); + + // release jenv->ReleaseFloatArrayElements(jgrad, grad, 0); jenv->ReleaseFloatArrayElements(jhess, hess, 0); + return ret; + API_END(); } /* diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j.h b/jvm-packages/xgboost4j/src/native/xgboost4j.h index 11a2f86ff..b221c6a57 100644 --- a/jvm-packages/xgboost4j/src/native/xgboost4j.h +++ b/jvm-packages/xgboost4j/src/native/xgboost4j.h @@ -185,11 +185,11 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterUpdateOne /* * Class: ml_dmlc_xgboost4j_java_XGBoostJNI - * Method: XGBoosterBoostOneIter - * Signature: (JJ[F[F)I + * Method: XGBoosterTrainOneIter + * Signature: (JJI[F[F)I */ -JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterBoostOneIter - (JNIEnv *, jclass, jlong, jlong, jfloatArray, jfloatArray); +JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterTrainOneIter + (JNIEnv *, jclass, jlong, jlong, jint, jfloatArray, jfloatArray); /* * Class: ml_dmlc_xgboost4j_java_XGBoostJNI @@ -386,19 +386,17 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixCreateFro /* * Class: ml_dmlc_xgboost4j_java_XGBoostJNI * Method: XGBoosterSetStrFeatureInfo - * Signature: (JLjava/lang/String;[Ljava/lang/String;])I + * Signature: (JLjava/lang/String;[Ljava/lang/String;)I */ -JNIEXPORT jint JNICALL -Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterSetStrFeatureInfo +JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterSetStrFeatureInfo (JNIEnv *, jclass, jlong, jstring, jobjectArray); /* * Class: 
ml_dmlc_xgboost4j_java_XGBoostJNI * Method: XGBoosterGetStrFeatureInfo - * Signature: (JLjava/lang/String;[Ljava/lang/String;])I + * Signature: (JLjava/lang/String;[Ljava/lang/String;)I */ -JNIEXPORT jint JNICALL -Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterGetStrFeatureInfo +JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterGetStrFeatureInfo (JNIEnv *, jclass, jlong, jstring, jobjectArray); #ifdef __cplusplus diff --git a/plugin/example/custom_obj.cc b/plugin/example/custom_obj.cc index 3f18330ce..b996447a3 100644 --- a/plugin/example/custom_obj.cc +++ b/plugin/example/custom_obj.cc @@ -1,5 +1,5 @@ -/*! - * Copyright 2015-2022 by Contributors +/** + * Copyright 2015-2023, XGBoost Contributors * \file custom_metric.cc * \brief This is an example to define plugin of xgboost. * This plugin defines the additional metric function. @@ -9,9 +9,7 @@ #include #include -namespace xgboost { -namespace obj { - +namespace xgboost::obj { // This is a helpful data structure to define parameters // You do not have to use it. // see http://dmlc-core.readthedocs.org/en/latest/parameter.html @@ -33,38 +31,38 @@ class MyLogistic : public ObjFunction { public: void Configure(const Args& args) override { param_.UpdateAllowUnknown(args); } - ObjInfo Task() const override { return ObjInfo::kRegression; } + [[nodiscard]] ObjInfo Task() const override { return ObjInfo::kRegression; } - void GetGradient(const HostDeviceVector& preds, const MetaInfo& info, int32_t /*iter*/, - HostDeviceVector* out_gpair) override { - out_gpair->Resize(preds.Size()); - const std::vector& preds_h = preds.HostVector(); - std::vector& out_gpair_h = out_gpair->HostVector(); + void GetGradient(const HostDeviceVector& preds, MetaInfo const& info, + std::int32_t /*iter*/, linalg::Matrix* out_gpair) override { + out_gpair->Reshape(info.num_row_, 1); + const std::vector& preds_h = preds.HostVector(); + auto out_gpair_h = out_gpair->HostView(); auto const labels_h = info.labels.HostView(); for (size_t i = 0; i < preds_h.size(); ++i) { - bst_float w = info.GetWeight(i); + float w = info.GetWeight(i); // scale the negative examples! if (labels_h(i) == 0.0f) w *= param_.scale_neg_weight; // logistic transformation - bst_float p = 1.0f / (1.0f + std::exp(-preds_h[i])); + float p = 1.0f / (1.0f + std::exp(-preds_h[i])); // this is the gradient - bst_float grad = (p - labels_h(i)) * w; + float grad = (p - labels_h(i)) * w; // this is the second order gradient - bst_float hess = p * (1.0f - p) * w; - out_gpair_h.at(i) = GradientPair(grad, hess); + float hess = p * (1.0f - p) * w; + out_gpair_h(i) = GradientPair(grad, hess); } } - const char* DefaultEvalMetric() const override { + [[nodiscard]] const char* DefaultEvalMetric() const override { return "logloss"; } - void PredTransform(HostDeviceVector *io_preds) const override { + void PredTransform(HostDeviceVector *io_preds) const override { // transform margin value to probability. 
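For comparison with the rewritten MyLogistic plugin, a Python analogue of the same weighted logistic objective under the new matrix convention might look like the sketch below. The scale_neg_weight constant stands in for the plugin's parameter and the shapes follow the Reshape(num_row, 1) call above:

    import numpy as np
    import xgboost as xgb

    scale_neg_weight = 0.5  # stand-in for the plugin's scale_neg_weight


    def my_logistic(predt: np.ndarray, dtrain: xgb.DMatrix):
        y = dtrain.get_label()
        w = np.where(y == 0.0, scale_neg_weight, 1.0)  # scale the negatives
        p = 1.0 / (1.0 + np.exp(-predt))               # logistic transform
        grad = (p - y) * w
        hess = p * (1.0 - p) * w
        # Single-target binary task: shape the result as (n_samples, 1).
        return grad.reshape(-1, 1), hess.reshape(-1, 1)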
- std::vector &preds = io_preds->HostVector(); + std::vector &preds = io_preds->HostVector(); for (auto& pred : preds) { pred = 1.0f / (1.0f + std::exp(-pred)); } } - bst_float ProbToMargin(bst_float base_score) const override { + [[nodiscard]] float ProbToMargin(float base_score) const override { // transform probability to margin value return -std::log(1.0f / base_score - 1.0f); } @@ -89,5 +87,4 @@ XGBOOST_REGISTER_OBJECTIVE(MyLogistic, "mylogistic") .describe("User defined logistic regression plugin") .set_body([]() { return new MyLogistic(); }); -} // namespace obj -} // namespace xgboost +} // namespace xgboost::obj diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index d59d2f1d1..486cee514 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -2053,12 +2053,14 @@ class Booster: else: pred = self.predict(dtrain, output_margin=True, training=True) grad, hess = fobj(pred, dtrain) - self.boost(dtrain, grad, hess) + self.boost(dtrain, iteration=iteration, grad=grad, hess=hess) - def boost(self, dtrain: DMatrix, grad: np.ndarray, hess: np.ndarray) -> None: - """Boost the booster for one iteration, with customized gradient - statistics. Like :py:func:`xgboost.Booster.update`, this - function should not be called directly by users. + def boost( + self, dtrain: DMatrix, iteration: int, grad: NumpyOrCupy, hess: NumpyOrCupy + ) -> None: + """Boost the booster for one iteration with customized gradient statistics. + Like :py:func:`xgboost.Booster.update`, this function should not be called + directly by users. Parameters ---------- @@ -2070,19 +2072,53 @@ class Booster: The second order of gradient. """ - if len(grad) != len(hess): - raise ValueError(f"grad / hess length mismatch: {len(grad)} / {len(hess)}") - if not isinstance(dtrain, DMatrix): - raise TypeError(f"invalid training matrix: {type(dtrain).__name__}") + from .data import ( + _array_interface, + _cuda_array_interface, + _ensure_np_dtype, + _is_cupy_array, + ) + self._assign_dmatrix_features(dtrain) + def is_flatten(array: NumpyOrCupy) -> bool: + return len(array.shape) == 1 or array.shape[1] == 1 + + def array_interface(array: NumpyOrCupy) -> bytes: + # Can we check for __array_interface__ instead of a specific type instead? + msg = ( + "Expecting `np.ndarray` or `cupy.ndarray` for gradient and hessian." 
+ f" Got: {type(array)}" + ) + if not isinstance(array, np.ndarray) and not _is_cupy_array(array): + raise TypeError(msg) + + n_samples = dtrain.num_row() + if array.shape[0] != n_samples and is_flatten(array): + warnings.warn( + "Since 2.1.0, the shape of the gradient and hessian is required to" + " be (n_samples, n_targets) or (n_samples, n_classes).", + FutureWarning, + ) + array = array.reshape(n_samples, array.size // n_samples) + + if isinstance(array, np.ndarray): + array, _ = _ensure_np_dtype(array, array.dtype) + interface = _array_interface(array) + elif _is_cupy_array(array): + interface = _cuda_array_interface(array) + else: + raise TypeError(msg) + + return interface + _check_call( - _LIB.XGBoosterBoostOneIter( + _LIB.XGBoosterTrainOneIter( self.handle, dtrain.handle, - c_array(ctypes.c_float, grad), - c_array(ctypes.c_float, hess), - c_bst_ulong(len(grad)), + iteration, + array_interface(grad), + array_interface(hess), ) ) diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 8a21b6085..41fd6405a 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -763,13 +763,31 @@ def softmax(x: np.ndarray) -> np.ndarray: return e / np.sum(e) -def softprob_obj(classes: int) -> SklObjective: +def softprob_obj( + classes: int, use_cupy: bool = False, order: str = "C", gdtype: str = "float32" +) -> SklObjective: + """Custom softprob objective for testing. + + Parameters + ---------- + use_cupy : + Whether the objective should return cupy arrays. + order : + The order of gradient matrices. "C" or "F". + gdtype : + DType for gradient. Hessian is not set. This is for testing asymmetric types. + """ + if use_cupy: + import cupy as backend + else: + backend = np + def objective( - labels: np.ndarray, predt: np.ndarray - ) -> Tuple[np.ndarray, np.ndarray]: + labels: backend.ndarray, predt: backend.ndarray + ) -> Tuple[backend.ndarray, backend.ndarray]: rows = labels.shape[0] - grad = np.zeros((rows, classes), dtype=float) - hess = np.zeros((rows, classes), dtype=float) + grad = backend.zeros((rows, classes), dtype=np.float32) + hess = backend.zeros((rows, classes), dtype=np.float32) eps = 1e-6 for r in range(predt.shape[0]): target = labels[r] @@ -781,8 +799,10 @@ def softprob_obj(classes: int) -> SklObjective: grad[r, c] = g hess[r, c] = h - grad = grad.reshape((rows * classes, 1)) - hess = hess.reshape((rows * classes, 1)) + grad = grad.reshape((rows, classes)) + hess = hess.reshape((rows, classes)) + grad = backend.require(grad, requirements=order, dtype=gdtype) + hess = backend.require(hess, requirements=order) return grad, hess return objective diff --git a/python-package/xgboost/training.py b/python-package/xgboost/training.py index aa3c18a01..e74a56904 100644 --- a/python-package/xgboost/training.py +++ b/python-package/xgboost/training.py @@ -178,7 +178,7 @@ def train( for i in range(start_iteration, num_boost_round): if cb_container.before_iteration(bst, i, dtrain, evals): break - bst.update(dtrain, i, obj) + bst.update(dtrain, iteration=i, fobj=obj) if cb_container.after_iteration(bst, i, dtrain, evals): break diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 5b49d136f..2b0862d49 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -22,6 +22,7 @@ #include "../common/charconv.h" // for from_chars, to_chars, NumericLimits, from_ch... #include "../common/hist_util.h" // for HistogramCuts #include "../common/io.h" // for FileExtension, LoadSequentialFile, MemoryBuf... 
+#include "../common/linalg_op.h" // for ElementWiseTransformHost #include "../common/threading_utils.h" // for OmpGetNumThreads, ParallelFor #include "../data/adapter.h" // for ArrayAdapter, DenseAdapter, RecordBatchesIte... #include "../data/ellpack_page.h" // for EllpackPage @@ -68,6 +69,7 @@ XGB_DLL void XGBoostVersion(int* major, int* minor, int* patch) { } } +static_assert(DMLC_CXX11_THREAD_LOCAL, "XGBoost depends on thread-local storage."); using GlobalConfigAPIThreadLocalStore = dmlc::ThreadLocalStore; #if !defined(XGBOOST_USE_CUDA) @@ -717,8 +719,7 @@ XGB_DLL int XGDMatrixGetUIntInfo(const DMatrixHandle handle, API_END(); } -XGB_DLL int XGDMatrixNumRow(const DMatrixHandle handle, - xgboost::bst_ulong *out) { +XGB_DLL int XGDMatrixNumRow(DMatrixHandle handle, xgboost::bst_ulong *out) { API_BEGIN(); CHECK_HANDLE(); auto p_m = CastDMatrixHandle(handle); @@ -727,8 +728,7 @@ XGB_DLL int XGDMatrixNumRow(const DMatrixHandle handle, API_END(); } -XGB_DLL int XGDMatrixNumCol(const DMatrixHandle handle, - xgboost::bst_ulong *out) { +XGB_DLL int XGDMatrixNumCol(DMatrixHandle handle, xgboost::bst_ulong *out) { API_BEGIN(); CHECK_HANDLE(); auto p_m = CastDMatrixHandle(handle); @@ -970,28 +970,71 @@ XGB_DLL int XGBoosterUpdateOneIter(BoosterHandle handle, API_END(); } -XGB_DLL int XGBoosterBoostOneIter(BoosterHandle handle, - DMatrixHandle dtrain, - bst_float *grad, - bst_float *hess, - xgboost::bst_ulong len) { +XGB_DLL int XGBoosterBoostOneIter(BoosterHandle handle, DMatrixHandle dtrain, bst_float *grad, + bst_float *hess, xgboost::bst_ulong len) { API_BEGIN(); CHECK_HANDLE(); - HostDeviceVector tmp_gpair; - auto* bst = static_cast(handle); - auto* dtr = - static_cast*>(dtrain); - tmp_gpair.Resize(len); - std::vector& tmp_gpair_h = tmp_gpair.HostVector(); - if (len > 0) { - xgboost_CHECK_C_ARG_PTR(grad); - xgboost_CHECK_C_ARG_PTR(hess); - } - for (xgboost::bst_ulong i = 0; i < len; ++i) { - tmp_gpair_h[i] = GradientPair(grad[i], hess[i]); - } + error::DeprecatedFunc(__func__, "2.1.0", "XGBoosterTrainOneIter"); + auto *learner = static_cast(handle); + auto ctx = learner->Ctx()->MakeCPU(); - bst->BoostOneIter(0, *dtr, &tmp_gpair); + auto t_grad = linalg::MakeTensorView(&ctx, common::Span{grad, len}, len); + auto t_hess = linalg::MakeTensorView(&ctx, common::Span{hess, len}, len); + + auto s_grad = linalg::ArrayInterfaceStr(t_grad); + auto s_hess = linalg::ArrayInterfaceStr(t_hess); + + return XGBoosterTrainOneIter(handle, dtrain, 0, s_grad.c_str(), s_hess.c_str()); + API_END(); +} + +namespace xgboost { +// copy user-supplied CUDA gradient arrays +void CopyGradientFromCUDAArrays(Context const *, ArrayInterface<2, false> const &, + ArrayInterface<2, false> const &, linalg::Matrix *) +#if !defined(XGBOOST_USE_CUDA) +{ + common::AssertGPUSupport(); +} +#else +; // NOLINT +#endif +} // namespace xgboost + +XGB_DLL int XGBoosterTrainOneIter(BoosterHandle handle, DMatrixHandle dtrain, int iter, + char const *grad, char const *hess) { + API_BEGIN(); + CHECK_HANDLE(); + xgboost_CHECK_C_ARG_PTR(grad); + xgboost_CHECK_C_ARG_PTR(hess); + auto p_fmat = CastDMatrixHandle(dtrain); + ArrayInterface<2, false> i_grad{StringView{grad}}; + ArrayInterface<2, false> i_hess{StringView{hess}}; + StringView msg{"Mismatched shape between the gradient and hessian."}; + CHECK_EQ(i_grad.Shape(0), i_hess.Shape(0)) << msg; + CHECK_EQ(i_grad.Shape(1), i_hess.Shape(1)) << msg; + linalg::Matrix gpair; + auto grad_is_cuda = ArrayInterfaceHandler::IsCudaPtr(i_grad.data); + auto hess_is_cuda = 
ArrayInterfaceHandler::IsCudaPtr(i_hess.data); + CHECK_EQ(i_grad.Shape(0), p_fmat->Info().num_row_) + << "Mismatched size between the gradient and training data."; + CHECK_EQ(grad_is_cuda, hess_is_cuda) << "gradient and hessian should be on the same device."; + auto *learner = static_cast(handle); + auto ctx = learner->Ctx(); + if (!grad_is_cuda) { + gpair.Reshape(i_grad.Shape(0), i_grad.Shape(1)); + auto const shape = gpair.Shape(); + auto h_gpair = gpair.HostView(); + DispatchDType(i_grad, DeviceOrd::CPU(), [&](auto &&t_grad) { + DispatchDType(i_hess, DeviceOrd::CPU(), [&](auto &&t_hess) { + common::ParallelFor(h_gpair.Size(), ctx->Threads(), + detail::CustomGradHessOp{t_grad, t_hess, h_gpair}); + }); + }); + } else { + CopyGradientFromCUDAArrays(ctx, i_grad, i_hess, &gpair); + } + learner->BoostOneIter(iter, p_fmat, &gpair); API_END(); } diff --git a/src/c_api/c_api.cu b/src/c_api/c_api.cu index 964ab0c3f..21674f785 100644 --- a/src/c_api/c_api.cu +++ b/src/c_api/c_api.cu @@ -1,8 +1,12 @@ /** * Copyright 2019-2023 by XGBoost Contributors */ -#include "../common/api_entry.h" // XGBAPIThreadLocalEntry +#include // for transform + +#include "../common/api_entry.h" // for XGBAPIThreadLocalEntry +#include "../common/cuda_context.cuh" // for CUDAContext #include "../common/threading_utils.h" +#include "../data/array_interface.h" // for DispatchDType, ArrayInterface #include "../data/device_adapter.cuh" #include "../data/proxy_dmatrix.h" #include "c_api_error.h" @@ -13,7 +17,6 @@ #include "xgboost/learner.h" namespace xgboost { - void XGBBuildInfoDevice(Json *p_info) { auto &info = *p_info; @@ -55,6 +58,27 @@ void XGBoostAPIGuard::RestoreGPUAttribute() { // If errors, do nothing, assuming running on CPU only machine. cudaSetDevice(device_id_); } + +void CopyGradientFromCUDAArrays(Context const *ctx, ArrayInterface<2, false> const &grad, + ArrayInterface<2, false> const &hess, + linalg::Matrix *out_gpair) { + auto grad_dev = dh::CudaGetPointerDevice(grad.data); + auto hess_dev = dh::CudaGetPointerDevice(hess.data); + CHECK_EQ(grad_dev, hess_dev) << "gradient and hessian should be on the same device."; + auto &gpair = *out_gpair; + gpair.SetDevice(grad_dev); + gpair.Reshape(grad.Shape(0), grad.Shape(1)); + auto d_gpair = gpair.View(grad_dev); + auto cuctx = ctx->CUDACtx(); + + DispatchDType(grad, DeviceOrd::CUDA(grad_dev), [&](auto &&t_grad) { + DispatchDType(hess, DeviceOrd::CUDA(hess_dev), [&](auto &&t_hess) { + CHECK_EQ(t_grad.Size(), t_hess.Size()); + thrust::for_each_n(cuctx->CTP(), thrust::make_counting_iterator(0ul), t_grad.Size(), + detail::CustomGradHessOp{t_grad, t_hess, d_gpair}); + }); + }); +} } // namespace xgboost using namespace xgboost; // NOLINT diff --git a/src/c_api/c_api_error.h b/src/c_api/c_api_error.h index 019bc1cf0..11c440384 100644 --- a/src/c_api/c_api_error.h +++ b/src/c_api/c_api_error.h @@ -1,5 +1,5 @@ -/*! - * Copyright (c) 2015-2022 by Contributors +/** + * Copyright 2015-2023, XGBoost Contributors * \file c_api_error.h * \brief Error handling for C API. */ @@ -35,8 +35,8 @@ } \ return 0; // NOLINT(*) -#define CHECK_HANDLE() if (handle == nullptr) \ - LOG(FATAL) << "DMatrix/Booster has not been initialized or has already been disposed."; +#define CHECK_HANDLE() \ + if (handle == nullptr) ::xgboost::detail::EmptyHandle(); /*! 
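Because the CUDA path consumes device pointers directly, an objective can hand back CuPy arrays without a host round trip. A sketch under the assumption that a CUDA-enabled build and CuPy are available (whether the margin arrives as a host or device array depends on the configuration, so it is converted defensively here):

    import cupy as cp
    import numpy as np
    import xgboost as xgb


    def squared_error_gpu(predt: np.ndarray, dtrain: xgb.DMatrix):
        d_predt = cp.asarray(predt)  # move the margin to the device if needed
        d_y = cp.asarray(dtrain.get_label()).reshape(d_predt.shape)
        grad = (d_predt - d_y).reshape(-1, 1)     # stays on the GPU
        hess = cp.ones_like(d_predt).reshape(-1, 1)
        return grad, hess

    # e.g. xgb.train({"tree_method": "gpu_hist"}, dtrain, obj=squared_error_gpu)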
* \brief Set the last error message needed by C API diff --git a/src/c_api/c_api_utils.h b/src/c_api/c_api_utils.h index 1af0206be..e42eed633 100644 --- a/src/c_api/c_api_utils.h +++ b/src/c_api/c_api_utils.h @@ -7,8 +7,10 @@ #include #include #include -#include // std::shared_ptr -#include +#include // for shared_ptr +#include // for string +#include // for make_tuple +#include // for move #include #include "xgboost/c_api.h" @@ -16,7 +18,7 @@ #include "xgboost/feature_map.h" // for FeatureMap #include "xgboost/json.h" #include "xgboost/learner.h" -#include "xgboost/linalg.h" // ArrayInterfaceHandler +#include "xgboost/linalg.h" // ArrayInterfaceHandler, MakeTensorView, ArrayInterfaceStr #include "xgboost/logging.h" #include "xgboost/string_view.h" // StringView @@ -287,6 +289,19 @@ inline std::shared_ptr CastDMatrixHandle(DMatrixHandle const handle) { } namespace detail { +inline void EmptyHandle() { + LOG(FATAL) << "DMatrix/Booster has not been initialized or has already been disposed."; +} + +inline xgboost::Context const *BoosterCtx(BoosterHandle handle) { + if (handle == nullptr) { + EmptyHandle(); + } + auto *learner = static_cast(handle); + CHECK(learner); + return learner->Ctx(); +} + template void MakeSparseFromPtr(PtrT const *p_indptr, I const *p_indices, T const *p_data, std::size_t nindptr, std::string *indptr_str, std::string *indices_str, @@ -334,6 +349,40 @@ void MakeSparseFromPtr(PtrT const *p_indptr, I const *p_indices, T const *p_data Json::Dump(jindices, indices_str); Json::Dump(jdata, data_str); } + +/** + * @brief Make array interface for other language bindings. + */ +template +auto MakeGradientInterface(Context const *ctx, G const *grad, H const *hess, std::size_t n_samples, + std::size_t n_targets) { + auto t_grad = + linalg::MakeTensorView(ctx, common::Span{grad, n_samples * n_targets}, n_samples, n_targets); + auto t_hess = + linalg::MakeTensorView(ctx, common::Span{hess, n_samples * n_targets}, n_samples, n_targets); + auto s_grad = linalg::ArrayInterfaceStr(t_grad); + auto s_hess = linalg::ArrayInterfaceStr(t_hess); + return std::make_tuple(s_grad, s_hess); +} + +template +struct CustomGradHessOp { + linalg::MatrixView t_grad; + linalg::MatrixView t_hess; + linalg::MatrixView d_gpair; + + CustomGradHessOp(linalg::MatrixView t_grad, linalg::MatrixView t_hess, + linalg::MatrixView d_gpair) + : t_grad{std::move(t_grad)}, t_hess{std::move(t_hess)}, d_gpair{std::move(d_gpair)} {} + + XGBOOST_DEVICE void operator()(std::size_t i) { + auto [m, n] = linalg::UnravelIndex(i, t_grad.Shape(0), t_grad.Shape(1)); + auto g = t_grad(m, n); + auto h = t_hess(m, n); + // from struct of arrays to array of structs. + d_gpair(m, n) = GradientPair{static_cast(g), static_cast(h)}; + } +}; } // namespace detail } // namespace xgboost #endif // XGBOOST_C_API_C_API_UTILS_H_ diff --git a/src/data/array_interface.h b/src/data/array_interface.h index 99effffef..c62a5cef2 100644 --- a/src/data/array_interface.h +++ b/src/data/array_interface.h @@ -384,7 +384,7 @@ inline bool ArrayInterfaceHandler::IsCudaPtr(void const *) { return false; } * numpy has the proper support even though it's in the __cuda_array_interface__ * protocol defined by numba. 
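CustomGradHessOp converts the user's two separate matrices (a struct-of-arrays layout) into the internal GradientPair layout (array-of-structs). In NumPy terms the transformation is just an interleave; the following is for intuition only:

    import numpy as np

    grad = np.arange(6, dtype=np.float32).reshape(3, 2)  # (n_samples, n_targets)
    hess = np.ones_like(grad)

    # GradientPair stores (grad, hess) adjacently for every element.
    gpair = np.empty(grad.shape, dtype=[("grad", "<f4"), ("hess", "<f4")])
    gpair["grad"], gpair["hess"] = grad, hess

    # The first pair in memory is (grad[0, 0], hess[0, 0]) == (0.0, 1.0).
    assert gpair.tobytes()[:8] == np.float32([0.0, 1.0]).tobytes()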
*/ -template +template class ArrayInterface { static_assert(D > 0, "Invalid dimension for array interface."); @@ -588,7 +588,7 @@ class ArrayInterface { }; template -void DispatchDType(ArrayInterface const array, std::int32_t device, Fn fn) { +void DispatchDType(ArrayInterface const array, DeviceOrd device, Fn fn) { // Only used for cuDF at the moment. CHECK_EQ(array.valid.Capacity(), 0); auto dispatch = [&](auto t) { diff --git a/src/data/data.cc b/src/data/data.cc index e8ecccb81..467770715 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -448,7 +448,7 @@ void CopyTensorInfoImpl(Context const& ctx, Json arr_interface, linalg::TensorView(Context::kCpuId); CHECK(t_out.CContiguous()); auto const shape = t_out.Shape(); - DispatchDType(array, Context::kCpuId, [&](auto&& in) { + DispatchDType(array, DeviceOrd::CPU(), [&](auto&& in) { linalg::ElementWiseTransformHost(t_out, ctx.Threads(), [&](auto i, auto) { return std::apply(in, linalg::UnravelIndex(i, shape)); }); diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc index 520f76581..bf4f6b92f 100644 --- a/src/gbm/gblinear.cc +++ b/src/gbm/gblinear.cc @@ -29,7 +29,6 @@ #include "../common/error_msg.h" namespace xgboost::gbm { - DMLC_REGISTRY_FILE_TAG(gblinear); // training parameters @@ -142,7 +141,7 @@ class GBLinear : public GradientBooster { this->updater_->SaveConfig(&j_updater); } - void DoBoost(DMatrix* p_fmat, HostDeviceVector* in_gpair, PredictionCacheEntry*, + void DoBoost(DMatrix* p_fmat, linalg::Matrix* in_gpair, PredictionCacheEntry*, ObjFunction const*) override { monitor_.Start("DoBoost"); @@ -232,9 +231,8 @@ class GBLinear : public GradientBooster { std::fill(contribs.begin(), contribs.end(), 0); } - std::vector DumpModel(const FeatureMap& fmap, - bool with_stats, - std::string format) const override { + [[nodiscard]] std::vector DumpModel(const FeatureMap& fmap, bool with_stats, + std::string format) const override { return model_.DumpModel(fmap, with_stats, format); } @@ -263,7 +261,7 @@ class GBLinear : public GradientBooster { } } - bool UseGPU() const override { + [[nodiscard]] bool UseGPU() const override { if (param_.updater == "gpu_coord_descent") { return true; } else { diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 92154609c..e9c5be003 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -167,8 +167,8 @@ void GBTree::Configure(Args const& cfg) { } } -void GPUCopyGradient(HostDeviceVector const*, bst_group_t, bst_group_t, - HostDeviceVector*) +void GPUCopyGradient(Context const*, linalg::Matrix const*, bst_group_t, + linalg::Matrix*) #if defined(XGBOOST_USE_CUDA) ; // NOLINT #else @@ -177,16 +177,19 @@ void GPUCopyGradient(HostDeviceVector const*, bst_group_t, bst_gro } #endif -void CopyGradient(HostDeviceVector const* in_gpair, int32_t n_threads, - bst_group_t n_groups, bst_group_t group_id, - HostDeviceVector* out_gpair) { - if (in_gpair->DeviceIdx() != Context::kCpuId) { - GPUCopyGradient(in_gpair, n_groups, group_id, out_gpair); +void CopyGradient(Context const* ctx, linalg::Matrix const* in_gpair, + bst_group_t group_id, linalg::Matrix* out_gpair) { + out_gpair->SetDevice(ctx->Device()); + out_gpair->Reshape(in_gpair->Shape(0), 1); + if (ctx->IsCUDA()) { + GPUCopyGradient(ctx, in_gpair, group_id, out_gpair); } else { - std::vector &tmp_h = out_gpair->HostVector(); - const auto& gpair_h = in_gpair->ConstHostVector(); - common::ParallelFor(out_gpair->Size(), n_threads, - [&](auto i) { tmp_h[i] = gpair_h[i * n_groups + group_id]; }); + auto const& in = *in_gpair; + auto target_gpair = 
in.Slice(linalg::All(), group_id); + auto h_tmp = out_gpair->HostView(); + auto h_in = in.HostView().Slice(linalg::All(), group_id); + CHECK_EQ(h_tmp.Size(), h_in.Size()); + common::ParallelFor(h_in.Size(), ctx->Threads(), [&](auto i) { h_tmp(i) = h_in(i); }); } } @@ -215,7 +218,7 @@ void GBTree::UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector const } } -void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector* in_gpair, +void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix* in_gpair, PredictionCacheEntry* predt, ObjFunction const* obj) { if (model_.learner_model_param->IsVectorLeaf()) { CHECK(tparam_.tree_method == TreeMethod::kHist || tparam_.tree_method == TreeMethod::kAuto) @@ -263,12 +266,12 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector* in_gpair, } } else { CHECK_EQ(in_gpair->Size() % n_groups, 0U) << "must have exactly ngroup * nrow gpairs"; - HostDeviceVector tmp(in_gpair->Size() / n_groups, GradientPair(), - in_gpair->DeviceIdx()); + linalg::Matrix tmp{{in_gpair->Shape(0), static_cast(1ul)}, + ctx_->Ordinal()}; bool update_predict = true; for (bst_target_t gid = 0; gid < n_groups; ++gid) { node_position.clear(); - CopyGradient(in_gpair, ctx_->Threads(), n_groups, gid, &tmp); + CopyGradient(ctx_, in_gpair, gid, &tmp); TreesOneGroup ret; BoostNewTrees(&tmp, p_fmat, gid, &node_position, &ret); UpdateTreeLeaf(p_fmat, predt->predictions, obj, gid, node_position, &ret); @@ -289,7 +292,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector* in_gpair, this->CommitModel(std::move(new_trees)); } -void GBTree::BoostNewTrees(HostDeviceVector* gpair, DMatrix* p_fmat, int bst_group, +void GBTree::BoostNewTrees(linalg::Matrix* gpair, DMatrix* p_fmat, int bst_group, std::vector>* out_position, TreesOneGroup* ret) { std::vector new_trees; diff --git a/src/gbm/gbtree.cu b/src/gbm/gbtree.cu index c1972b2fc..8c4a96090 100644 --- a/src/gbm/gbtree.cu +++ b/src/gbm/gbtree.cu @@ -1,22 +1,24 @@ /** * Copyright 2021-2023, XGBoost Contributors */ -#include "../common/device_helpers.cuh" -#include "xgboost/linalg.h" -#include "xgboost/span.h" +#include // for make_counting_iterator + +#include "../common/cuda_context.cuh" +#include "../common/device_helpers.cuh" // for MakeTransformIterator +#include "xgboost/base.h" // for GradientPair +#include "xgboost/linalg.h" // for Matrix namespace xgboost::gbm { -void GPUCopyGradient(HostDeviceVector const *in_gpair, - bst_group_t n_groups, bst_group_t group_id, - HostDeviceVector *out_gpair) { - auto mat = linalg::TensorView( - in_gpair->ConstDeviceSpan(), - {in_gpair->Size() / n_groups, static_cast(n_groups)}, - in_gpair->DeviceIdx()); - auto v_in = mat.Slice(linalg::All(), group_id); - out_gpair->Resize(v_in.Size()); - auto d_out = out_gpair->DeviceSpan(); - dh::LaunchN(v_in.Size(), [=] __device__(size_t i) { d_out[i] = v_in(i); }); +void GPUCopyGradient(Context const *ctx, linalg::Matrix const *in_gpair, + bst_group_t group_id, linalg::Matrix *out_gpair) { + auto v_in = in_gpair->View(ctx->Device()).Slice(linalg::All(), group_id); + out_gpair->SetDevice(ctx->Device()); + out_gpair->Reshape(v_in.Size(), 1); + auto d_out = out_gpair->View(ctx->Device()); + auto cuctx = ctx->CUDACtx(); + auto it = dh::MakeTransformIterator( + thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) { return v_in(i); }); + thrust::copy(cuctx->CTP(), it, it + v_in.Size(), d_out.Values().data()); } void GPUDartPredictInc(common::Span out_predts, diff --git a/src/gbm/gbtree.h b/src/gbm/gbtree.h index 81e568368..827d85217 100644 --- a/src/gbm/gbtree.h 
+++ b/src/gbm/gbtree.h @@ -183,8 +183,8 @@ class GBTree : public GradientBooster { /** * @brief Carry out one iteration of boosting. */ - void DoBoost(DMatrix* p_fmat, HostDeviceVector* in_gpair, - PredictionCacheEntry* predt, ObjFunction const* obj) override; + void DoBoost(DMatrix* p_fmat, linalg::Matrix* in_gpair, PredictionCacheEntry* predt, + ObjFunction const* obj) override; [[nodiscard]] bool UseGPU() const override { return tparam_.tree_method == TreeMethod::kGPUHist; } @@ -326,7 +326,7 @@ class GBTree : public GradientBooster { } protected: - void BoostNewTrees(HostDeviceVector* gpair, DMatrix* p_fmat, int bst_group, + void BoostNewTrees(linalg::Matrix* gpair, DMatrix* p_fmat, int bst_group, std::vector>* out_position, std::vector>* ret); diff --git a/src/learner.cc b/src/learner.cc index 81d1b795b..be562f972 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -1282,14 +1282,14 @@ class LearnerImpl : public LearnerIO { monitor_.Start("GetGradient"); GetGradient(predt.predictions, train->Info(), iter, &gpair_); monitor_.Stop("GetGradient"); - TrainingObserver::Instance().Observe(gpair_, "Gradients"); + TrainingObserver::Instance().Observe(*gpair_.Data(), "Gradients"); gbm_->DoBoost(train.get(), &gpair_, &predt, obj_.get()); monitor_.Stop("UpdateOneIter"); } void BoostOneIter(int iter, std::shared_ptr train, - HostDeviceVector* in_gpair) override { + linalg::Matrix* in_gpair) override { monitor_.Start("BoostOneIter"); this->Configure(); @@ -1299,6 +1299,9 @@ class LearnerImpl : public LearnerIO { this->ValidateDMatrix(train.get(), true); + CHECK_EQ(this->learner_model_param_.OutputLength(), in_gpair->Shape(1)) + << "The number of columns in gradient should be equal to the number of targets/classes in " + "the model."; auto& predt = prediction_container_.Cache(train, ctx_.gpu_id); gbm_->DoBoost(train.get(), in_gpair, &predt, obj_.get()); monitor_.Stop("BoostOneIter"); @@ -1461,18 +1464,18 @@ class LearnerImpl : public LearnerIO { } private: - void GetGradient(HostDeviceVector const& preds, MetaInfo const& info, int iteration, - HostDeviceVector* out_gpair) { - out_gpair->Resize(preds.Size()); - collective::ApplyWithLabels(info, out_gpair->HostPointer(), + void GetGradient(HostDeviceVector const& preds, MetaInfo const& info, + std::int32_t iter, linalg::Matrix* out_gpair) { + out_gpair->Reshape(info.num_row_, this->learner_model_param_.OutputLength()); + collective::ApplyWithLabels(info, out_gpair->Data()->HostPointer(), out_gpair->Size() * sizeof(GradientPair), - [&] { obj_->GetGradient(preds, info, iteration, out_gpair); }); + [&] { obj_->GetGradient(preds, info, iter, out_gpair); }); } /*! \brief random number transformation seed. */ static int32_t constexpr kRandSeedMagic = 127; // gradient pairs - HostDeviceVector gpair_; + linalg::Matrix gpair_; /*! \brief Temporary storage to prediction. 
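The per-class path in GBTree::DoBoost above now slices one column of the gradient matrix per group instead of striding through an interleaved vector. A NumPy rendering of what CopyGradient extracts (names and sizes are mine, for intuition only):

    import numpy as np

    n_samples, n_groups = 4, 3
    in_gpair = np.arange(n_samples * n_groups, dtype=np.float32)
    in_gpair = in_gpair.reshape(n_samples, n_groups)

    for group_id in range(n_groups):
        # Analogue of in_gpair.Slice(linalg::All(), group_id): one column,
        # kept as an (n_samples, 1) matrix for BoostNewTrees.
        tmp = in_gpair[:, [group_id]]
        assert tmp.shape == (n_samples, 1)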
Useful for storing data transformed by * objective function */ PredictionContainer output_predictions_; diff --git a/src/linear/updater_coordinate.cc b/src/linear/updater_coordinate.cc index f660a1be8..0d61d7c7c 100644 --- a/src/linear/updater_coordinate.cc +++ b/src/linear/updater_coordinate.cc @@ -45,30 +45,31 @@ class CoordinateUpdater : public LinearUpdater { out["coordinate_param"] = ToJson(cparam_); } - void Update(HostDeviceVector *in_gpair, DMatrix *p_fmat, - gbm::GBLinearModel *model, double sum_instance_weight) override { + void Update(linalg::Matrix *in_gpair, DMatrix *p_fmat, gbm::GBLinearModel *model, + double sum_instance_weight) override { + auto gpair = in_gpair->Data(); tparam_.DenormalizePenalties(sum_instance_weight); const int ngroup = model->learner_model_param->num_output_group; // update bias for (int group_idx = 0; group_idx < ngroup; ++group_idx) { - auto grad = GetBiasGradientParallel(group_idx, ngroup, in_gpair->ConstHostVector(), p_fmat, + auto grad = GetBiasGradientParallel(group_idx, ngroup, gpair->ConstHostVector(), p_fmat, ctx_->Threads()); auto dbias = static_cast(tparam_.learning_rate * CoordinateDeltaBias(grad.first, grad.second)); model->Bias()[group_idx] += dbias; - UpdateBiasResidualParallel(ctx_, group_idx, ngroup, dbias, &in_gpair->HostVector(), p_fmat); + UpdateBiasResidualParallel(ctx_, group_idx, ngroup, dbias, &gpair->HostVector(), p_fmat); } // prepare for updating the weights - selector_->Setup(ctx_, *model, in_gpair->ConstHostVector(), p_fmat, tparam_.reg_alpha_denorm, + selector_->Setup(ctx_, *model, gpair->ConstHostVector(), p_fmat, tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm, cparam_.top_k); // update weights for (int group_idx = 0; group_idx < ngroup; ++group_idx) { for (unsigned i = 0U; i < model->learner_model_param->num_feature; i++) { int fidx = - selector_->NextFeature(ctx_, i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat, + selector_->NextFeature(ctx_, i, *model, group_idx, gpair->ConstHostVector(), p_fmat, tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm); if (fidx < 0) break; - this->UpdateFeature(fidx, group_idx, &in_gpair->HostVector(), p_fmat, model); + this->UpdateFeature(fidx, group_idx, &gpair->HostVector(), p_fmat, model); } } monitor_.Stop("UpdateFeature"); diff --git a/src/linear/updater_gpu_coordinate.cu b/src/linear/updater_gpu_coordinate.cu index b6c817696..659b45135 100644 --- a/src/linear/updater_gpu_coordinate.cu +++ b/src/linear/updater_gpu_coordinate.cu @@ -93,17 +93,18 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT } } - void Update(HostDeviceVector *in_gpair, DMatrix *p_fmat, - gbm::GBLinearModel *model, double sum_instance_weight) override { + void Update(linalg::Matrix *in_gpair, DMatrix *p_fmat, gbm::GBLinearModel *model, + double sum_instance_weight) override { tparam_.DenormalizePenalties(sum_instance_weight); monitor_.Start("LazyInitDevice"); this->LazyInitDevice(p_fmat, *(model->learner_model_param)); monitor_.Stop("LazyInitDevice"); monitor_.Start("UpdateGpair"); + // Update gpair - if (ctx_->gpu_id >= 0) { - this->UpdateGpair(in_gpair->ConstHostVector()); + if (ctx_->IsCUDA()) { + this->UpdateGpair(in_gpair->Data()->ConstHostVector()); } monitor_.Stop("UpdateGpair"); @@ -111,15 +112,15 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT this->UpdateBias(model); monitor_.Stop("UpdateBias"); // prepare for updating the weights - selector_->Setup(ctx_, *model, in_gpair->ConstHostVector(), p_fmat, tparam_.reg_alpha_denorm, - tparam_.reg_lambda_denorm, 
coord_param_.top_k); + selector_->Setup(ctx_, *model, in_gpair->Data()->ConstHostVector(), p_fmat, + tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm, coord_param_.top_k); monitor_.Start("UpdateFeature"); for (uint32_t group_idx = 0; group_idx < model->learner_model_param->num_output_group; ++group_idx) { for (auto i = 0U; i < model->learner_model_param->num_feature; i++) { auto fidx = - selector_->NextFeature(ctx_, i, *model, group_idx, in_gpair->ConstHostVector(), p_fmat, - tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm); + selector_->NextFeature(ctx_, i, *model, group_idx, in_gpair->Data()->ConstHostVector(), + p_fmat, tparam_.reg_alpha_denorm, tparam_.reg_lambda_denorm); if (fidx < 0) break; this->UpdateFeature(fidx, group_idx, model); } diff --git a/src/linear/updater_shotgun.cc b/src/linear/updater_shotgun.cc index 18b747f64..78fb1fe1b 100644 --- a/src/linear/updater_shotgun.cc +++ b/src/linear/updater_shotgun.cc @@ -6,8 +6,7 @@ #include #include "coordinate_common.h" -namespace xgboost { -namespace linear { +namespace xgboost::linear { DMLC_REGISTRY_FILE_TAG(updater_shotgun); @@ -32,30 +31,31 @@ class ShotgunUpdater : public LinearUpdater { out["linear_train_param"] = ToJson(param_); } - void Update(HostDeviceVector *in_gpair, DMatrix *p_fmat, - gbm::GBLinearModel *model, double sum_instance_weight) override { - auto &gpair = in_gpair->HostVector(); + void Update(linalg::Matrix *in_gpair, DMatrix *p_fmat, gbm::GBLinearModel *model, + double sum_instance_weight) override { + auto gpair = in_gpair->Data(); param_.DenormalizePenalties(sum_instance_weight); const int ngroup = model->learner_model_param->num_output_group; // update bias for (int gid = 0; gid < ngroup; ++gid) { - auto grad = GetBiasGradientParallel(gid, ngroup, in_gpair->ConstHostVector(), p_fmat, + auto grad = GetBiasGradientParallel(gid, ngroup, gpair->ConstHostVector(), p_fmat, ctx_->Threads()); auto dbias = static_cast(param_.learning_rate * CoordinateDeltaBias(grad.first, grad.second)); model->Bias()[gid] += dbias; - UpdateBiasResidualParallel(ctx_, gid, ngroup, dbias, &in_gpair->HostVector(), p_fmat); + UpdateBiasResidualParallel(ctx_, gid, ngroup, dbias, &gpair->HostVector(), p_fmat); } // lock-free parallel updates of weights - selector_->Setup(ctx_, *model, in_gpair->ConstHostVector(), p_fmat, param_.reg_alpha_denorm, + selector_->Setup(ctx_, *model, gpair->ConstHostVector(), p_fmat, param_.reg_alpha_denorm, param_.reg_lambda_denorm, 0); + auto &h_gpair = gpair->HostVector(); for (const auto &batch : p_fmat->GetBatches(ctx_)) { auto page = batch.GetView(); const auto nfeat = static_cast(batch.Size()); common::ParallelFor(nfeat, ctx_->Threads(), [&](auto i) { - int ii = selector_->NextFeature(ctx_, i, *model, 0, in_gpair->ConstHostVector(), p_fmat, + int ii = selector_->NextFeature(ctx_, i, *model, 0, gpair->ConstHostVector(), p_fmat, param_.reg_alpha_denorm, param_.reg_lambda_denorm); if (ii < 0) return; const bst_uint fid = ii; @@ -63,7 +63,7 @@ class ShotgunUpdater : public LinearUpdater { for (int gid = 0; gid < ngroup; ++gid) { double sum_grad = 0.0, sum_hess = 0.0; for (auto &c : col) { - const GradientPair &p = gpair[c.index * ngroup + gid]; + const GradientPair &p = h_gpair[c.index * ngroup + gid]; if (p.GetHess() < 0.0f) continue; const bst_float v = c.fvalue; sum_grad += p.GetGrad() * v; @@ -77,7 +77,7 @@ class ShotgunUpdater : public LinearUpdater { w += dw; // update grad values for (auto &c : col) { - GradientPair &p = gpair[c.index * ngroup + gid]; + GradientPair &p = h_gpair[c.index * ngroup 
+ gid]; if (p.GetHess() < 0.0f) continue; p += GradientPair(p.GetHess() * c.fvalue * dw, 0); } @@ -98,5 +98,4 @@ XGBOOST_REGISTER_LINEAR_UPDATER(ShotgunUpdater, "shotgun") "Update linear model according to shotgun coordinate descent " "algorithm.") .set_body([]() { return new ShotgunUpdater(); }); -} // namespace linear -} // namespace xgboost +} // namespace xgboost::linear diff --git a/src/objective/aft_obj.cu b/src/objective/aft_obj.cu index 52a58a7f4..522866a42 100644 --- a/src/objective/aft_obj.cu +++ b/src/objective/aft_obj.cu @@ -1,5 +1,5 @@ -/*! - * Copyright 2019-2022 by Contributors +/** + * Copyright 2019-2023, XGBoost Contributors * \file aft_obj.cu * \brief Definition of AFT loss for survival analysis. * \author Avinash Barnwal, Hyunsu Cho and Toby Hocking @@ -41,11 +41,9 @@ class AFTObj : public ObjFunction { ObjInfo Task() const override { return ObjInfo::kSurvival; } template - void GetGradientImpl(const HostDeviceVector &preds, - const MetaInfo &info, - HostDeviceVector *out_gpair, - size_t ndata, int device, bool is_null_weight, - float aft_loss_distribution_scale) { + void GetGradientImpl(const HostDeviceVector& preds, const MetaInfo& info, + linalg::Matrix* out_gpair, size_t ndata, int device, + bool is_null_weight, float aft_loss_distribution_scale) { common::Transform<>::Init( [=] XGBOOST_DEVICE(size_t _idx, common::Span _out_gpair, @@ -66,16 +64,17 @@ class AFTObj : public ObjFunction { _out_gpair[_idx] = GradientPair(grad * w, hess * w); }, common::Range{0, static_cast(ndata)}, this->ctx_->Threads(), device).Eval( - out_gpair, &preds, &info.labels_lower_bound_, &info.labels_upper_bound_, + out_gpair->Data(), &preds, &info.labels_lower_bound_, &info.labels_upper_bound_, &info.weights_); } void GetGradient(const HostDeviceVector& preds, const MetaInfo& info, int /*iter*/, - HostDeviceVector* out_gpair) override { + linalg::Matrix* out_gpair) override { const size_t ndata = preds.Size(); CHECK_EQ(info.labels_lower_bound_.Size(), ndata); CHECK_EQ(info.labels_upper_bound_.Size(), ndata); - out_gpair->Resize(ndata); + out_gpair->SetDevice(ctx_->Device()); + out_gpair->Reshape(ndata, 1); const int device = ctx_->gpu_id; const float aft_loss_distribution_scale = param_.aft_loss_distribution_scale; const bool is_null_weight = info.weights_.Size() == 0; diff --git a/src/objective/hinge.cu b/src/objective/hinge.cu index bff3bc593..0d3ed6ca4 100644 --- a/src/objective/hinge.cu +++ b/src/objective/hinge.cu @@ -27,8 +27,8 @@ class HingeObj : public ObjFunction { void Configure(Args const&) override {} ObjInfo Task() const override { return ObjInfo::kRegression; } - void GetGradient(const HostDeviceVector &preds, const MetaInfo &info, int /*iter*/, - HostDeviceVector *out_gpair) override { + void GetGradient(const HostDeviceVector &preds, const MetaInfo &info, + std::int32_t /*iter*/, linalg::Matrix *out_gpair) override { CHECK_NE(info.labels.Size(), 0U) << "label set cannot be empty"; CHECK_EQ(preds.Size(), info.labels.Size()) << "labels are not correctly provided" @@ -41,7 +41,8 @@ class HingeObj : public ObjFunction { CHECK_EQ(info.weights_.Size(), ndata) << "Number of weights should be equal to number of data points."; } - out_gpair->Resize(ndata); + CHECK_EQ(info.labels.Shape(1), 1) << "Multi-target for `binary:hinge` is not yet supported."; + out_gpair->Reshape(ndata, 1); common::Transform<>::Init( [=] XGBOOST_DEVICE(size_t _idx, common::Span _out_gpair, @@ -63,7 +64,7 @@ class HingeObj : public ObjFunction { }, common::Range{0, static_cast(ndata)}, this->ctx_->Threads(), 
ctx_->gpu_id).Eval( - out_gpair, &preds, info.labels.Data(), &info.weights_); + out_gpair->Data(), &preds, info.labels.Data(), &info.weights_); } void PredTransform(HostDeviceVector *io_preds) const override { diff --git a/src/objective/init_estimation.cc b/src/objective/init_estimation.cc index 834c052f5..47e0364fe 100644 --- a/src/objective/init_estimation.cc +++ b/src/objective/init_estimation.cc @@ -21,7 +21,7 @@ void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector* b } // Avoid altering any state in child objective. HostDeviceVector dummy_predt(info.labels.Size(), 0.0f, this->ctx_->gpu_id); - HostDeviceVector gpair(info.labels.Size(), GradientPair{}, this->ctx_->gpu_id); + linalg::Matrix gpair(info.labels.Shape(), this->ctx_->gpu_id); Json config{Object{}}; this->SaveConfig(&config); diff --git a/src/objective/lambdarank_obj.cc b/src/objective/lambdarank_obj.cc index d0ff5bda5..46fd77705 100644 --- a/src/objective/lambdarank_obj.cc +++ b/src/objective/lambdarank_obj.cc @@ -165,9 +165,8 @@ class LambdaRankObj : public FitIntercept { void CalcLambdaForGroup(std::int32_t iter, common::Span g_predt, linalg::VectorView g_label, float w, common::Span g_rank, bst_group_t g, Delta delta, - common::Span g_gpair) { - std::fill_n(g_gpair.data(), g_gpair.size(), GradientPair{}); - auto p_gpair = g_gpair.data(); + linalg::VectorView g_gpair) { + std::fill_n(g_gpair.Values().data(), g_gpair.Size(), GradientPair{}); auto ti_plus = ti_plus_.HostView(); auto tj_minus = tj_minus_.HostView(); @@ -198,8 +197,8 @@ class LambdaRankObj : public FitIntercept { std::size_t idx_high = g_rank[rank_high]; std::size_t idx_low = g_rank[rank_low]; - p_gpair[idx_high] += pg; - p_gpair[idx_low] += ng; + g_gpair(idx_high) += pg; + g_gpair(idx_low) += ng; if (unbiased) { auto k = ti_plus.Size(); @@ -225,12 +224,13 @@ class LambdaRankObj : public FitIntercept { MakePairs(ctx_, iter, p_cache_, g, g_label, g_rank, loop); if (sum_lambda > 0.0) { double norm = std::log2(1.0 + sum_lambda) / sum_lambda; - std::transform(g_gpair.data(), g_gpair.data() + g_gpair.size(), g_gpair.data(), - [norm](GradientPair const& g) { return g * norm; }); + std::transform(g_gpair.Values().data(), g_gpair.Values().data() + g_gpair.Size(), + g_gpair.Values().data(), [norm](GradientPair const& g) { return g * norm; }); } auto w_norm = p_cache_->WeightNorm(); - std::transform(g_gpair.begin(), g_gpair.end(), g_gpair.begin(), + std::transform(g_gpair.Values().data(), g_gpair.Values().data() + g_gpair.Size(), + g_gpair.Values().data(), [&](GradientPair const& gpair) { return gpair * w * w_norm; }); } @@ -301,7 +301,7 @@ class LambdaRankObj : public FitIntercept { } void GetGradient(HostDeviceVector const& predt, MetaInfo const& info, std::int32_t iter, - HostDeviceVector* out_gpair) override { + linalg::Matrix* out_gpair) override { CHECK_EQ(info.labels.Size(), predt.Size()) << error::LabelScoreSize(); // init/renew cache @@ -339,7 +339,7 @@ class LambdaRankNDCG : public LambdaRankObj { void CalcLambdaForGroupNDCG(std::int32_t iter, common::Span g_predt, linalg::VectorView g_label, float w, common::Span g_rank, - common::Span g_gpair, + linalg::VectorView g_gpair, linalg::VectorView inv_IDCG, common::Span discount, bst_group_t g) { auto delta = [&](auto y_high, auto y_low, std::size_t rank_high, std::size_t rank_low, @@ -351,7 +351,7 @@ class LambdaRankNDCG : public LambdaRankObj { } void GetGradientImpl(std::int32_t iter, const HostDeviceVector& predt, - const MetaInfo& info, HostDeviceVector* out_gpair) { + const MetaInfo& info, 
linalg::Matrix* out_gpair) { if (ctx_->IsCUDA()) { cuda_impl::LambdaRankGetGradientNDCG( ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->gpu_id), @@ -363,8 +363,10 @@ class LambdaRankNDCG : public LambdaRankObj { bst_group_t n_groups = p_cache_->Groups(); auto gptr = p_cache_->DataGroupPtr(ctx_); - out_gpair->Resize(info.num_row_); - auto h_gpair = out_gpair->HostSpan(); + out_gpair->SetDevice(ctx_->Device()); + out_gpair->Reshape(info.num_row_, 1); + + auto h_gpair = out_gpair->HostView(); auto h_predt = predt.ConstHostSpan(); auto h_label = info.labels.HostView(); auto h_weight = common::MakeOptionalWeights(ctx_, info.weights_); @@ -378,7 +380,8 @@ class LambdaRankNDCG : public LambdaRankObj { std::size_t cnt = gptr[g + 1] - gptr[g]; auto w = h_weight[g]; auto g_predt = h_predt.subspan(gptr[g], cnt); - auto g_gpair = h_gpair.subspan(gptr[g], cnt); + auto g_gpair = + h_gpair.Slice(linalg::Range(static_cast(gptr[g]), gptr[g] + cnt), 0); auto g_label = h_label.Slice(make_range(g), 0); auto g_rank = rank_idx.subspan(gptr[g], cnt); @@ -420,7 +423,7 @@ void LambdaRankGetGradientNDCG(Context const*, std::int32_t, HostDeviceVector, // input bias ratio linalg::VectorView, // input bias ratio linalg::VectorView, linalg::VectorView, - HostDeviceVector*) { + linalg::Matrix*) { common::AssertGPUSupport(); } @@ -470,7 +473,7 @@ void MAPStat(Context const* ctx, linalg::VectorView label, class LambdaRankMAP : public LambdaRankObj { public: void GetGradientImpl(std::int32_t iter, const HostDeviceVector& predt, - const MetaInfo& info, HostDeviceVector* out_gpair) { + const MetaInfo& info, linalg::Matrix* out_gpair) { CHECK(param_.ndcg_exp_gain) << "NDCG gain can not be set for the MAP objective."; if (ctx_->IsCUDA()) { return cuda_impl::LambdaRankGetGradientMAP( @@ -482,8 +485,11 @@ class LambdaRankMAP : public LambdaRankObj { auto gptr = p_cache_->DataGroupPtr(ctx_).data(); bst_group_t n_groups = p_cache_->Groups(); - out_gpair->Resize(info.num_row_); - auto h_gpair = out_gpair->HostSpan(); + CHECK_EQ(info.labels.Shape(1), 1) << "multi-target for learning to rank is not yet supported."; + out_gpair->SetDevice(ctx_->Device()); + out_gpair->Reshape(info.num_row_, this->Targets(info)); + + auto h_gpair = out_gpair->HostView(); auto h_label = info.labels.HostView().Slice(linalg::All(), 0); auto h_predt = predt.ConstHostSpan(); auto rank_idx = p_cache_->SortedIdx(ctx_, h_predt); @@ -514,7 +520,7 @@ class LambdaRankMAP : public LambdaRankObj { auto cnt = gptr[g + 1] - gptr[g]; auto w = h_weight[g]; auto g_predt = h_predt.subspan(gptr[g], cnt); - auto g_gpair = h_gpair.subspan(gptr[g], cnt); + auto g_gpair = h_gpair.Slice(linalg::Range(gptr[g], gptr[g] + cnt), 0); auto g_label = h_label.Slice(make_range(g)); auto g_rank = rank_idx.subspan(gptr[g], cnt); @@ -545,7 +551,7 @@ void LambdaRankGetGradientMAP(Context const*, std::int32_t, HostDeviceVector, // input bias ratio linalg::VectorView, // input bias ratio linalg::VectorView, linalg::VectorView, - HostDeviceVector*) { + linalg::Matrix*) { common::AssertGPUSupport(); } } // namespace cuda_impl @@ -557,7 +563,7 @@ void LambdaRankGetGradientMAP(Context const*, std::int32_t, HostDeviceVector { public: void GetGradientImpl(std::int32_t iter, const HostDeviceVector& predt, - const MetaInfo& info, HostDeviceVector* out_gpair) { + const MetaInfo& info, linalg::Matrix* out_gpair) { CHECK(param_.ndcg_exp_gain) << "NDCG gain can not be set for the pairwise objective."; if (ctx_->IsCUDA()) { return cuda_impl::LambdaRankGetGradientPairwise( @@ -569,8 +575,10 @@ 
class LambdaRankPairwise : public LambdaRankObjDataGroupPtr(ctx_); bst_group_t n_groups = p_cache_->Groups(); - out_gpair->Resize(info.num_row_); - auto h_gpair = out_gpair->HostSpan(); + out_gpair->SetDevice(ctx_->Device()); + out_gpair->Reshape(info.num_row_, this->Targets(info)); + + auto h_gpair = out_gpair->HostView(); auto h_label = info.labels.HostView().Slice(linalg::All(), 0); auto h_predt = predt.ConstHostSpan(); auto h_weight = common::MakeOptionalWeights(ctx_, info.weights_); @@ -585,7 +593,7 @@ class LambdaRankPairwise : public LambdaRankObj, // input bias ratio linalg::VectorView, // input bias ratio linalg::VectorView, linalg::VectorView, - HostDeviceVector*) { + linalg::Matrix*) { common::AssertGPUSupport(); } } // namespace cuda_impl diff --git a/src/objective/lambdarank_obj.cu b/src/objective/lambdarank_obj.cu index 2a7cac751..0f57fce48 100644 --- a/src/objective/lambdarank_obj.cu +++ b/src/objective/lambdarank_obj.cu @@ -93,7 +93,7 @@ struct GetGradOp { // obtain group segment data. auto g_label = args.labels.Slice(linalg::Range(data_group_begin, data_group_begin + n_data), 0); auto g_predt = args.predts.subspan(data_group_begin, n_data); - auto g_gpair = args.gpairs.subspan(data_group_begin, n_data).data(); + auto g_gpair = args.gpairs.Slice(linalg::Range(data_group_begin, data_group_begin + n_data)); auto g_rank = args.d_sorted_idx.subspan(data_group_begin, n_data); auto [i, j] = make_pair(idx, g); @@ -128,8 +128,8 @@ struct GetGradOp { auto ngt = GradientPair{common::TruncateWithRounding(gr.GetGrad(), ng.GetGrad()), common::TruncateWithRounding(gr.GetHess(), ng.GetHess())}; - dh::AtomicAddGpair(g_gpair + idx_high, pgt); - dh::AtomicAddGpair(g_gpair + idx_low, ngt); + dh::AtomicAddGpair(&g_gpair(idx_high), pgt); + dh::AtomicAddGpair(&g_gpair(idx_low), ngt); } if (unbiased && need_update) { @@ -266,16 +266,16 @@ void CalcGrad(Context const* ctx, MetaInfo const& info, std::shared_ptrWeightNorm(); - thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul), d_gpair.size(), - [=] XGBOOST_DEVICE(std::size_t i) { + thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul), d_gpair.Size(), + [=] XGBOOST_DEVICE(std::size_t i) mutable { auto g = dh::SegmentId(d_gptr, i); auto sum_lambda = thrust::get<2>(d_max_lambdas[g]); // Normalization if (sum_lambda > 0.0) { double norm = std::log2(1.0 + sum_lambda) / sum_lambda; - d_gpair[i] *= norm; + d_gpair(i, 0) *= norm; } - d_gpair[i] *= (d_weights[g] * w_norm); + d_gpair(i, 0) *= (d_weights[g] * w_norm); }); } @@ -288,7 +288,7 @@ void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector const linalg::VectorView ti_plus, // input bias ratio linalg::VectorView tj_minus, // input bias ratio linalg::VectorView li, linalg::VectorView lj, - HostDeviceVector* out_gpair) { + linalg::Matrix* out_gpair) { // boilerplate std::int32_t device_id = ctx->gpu_id; dh::safe_cuda(cudaSetDevice(device_id)); @@ -296,8 +296,8 @@ void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector const info.labels.SetDevice(device_id); preds.SetDevice(device_id); - out_gpair->SetDevice(device_id); - out_gpair->Resize(preds.Size()); + out_gpair->SetDevice(ctx->Device()); + out_gpair->Reshape(preds.Size(), 1); CHECK(p_cache); @@ -308,8 +308,9 @@ void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector const auto label = info.labels.View(ctx->gpu_id); auto predts = preds.ConstDeviceSpan(); - auto gpairs = out_gpair->DeviceSpan(); - thrust::fill_n(ctx->CUDACtx()->CTP(), gpairs.data(), gpairs.size(), 
GradientPair{0.0f, 0.0f}); + auto gpairs = out_gpair->View(ctx->Device()); + thrust::fill_n(ctx->CUDACtx()->CTP(), gpairs.Values().data(), gpairs.Size(), + GradientPair{0.0f, 0.0f}); auto const d_threads_group_ptr = p_cache->CUDAThreadsGroupPtr(); auto const d_gptr = p_cache->DataGroupPtr(ctx); @@ -371,7 +372,7 @@ void LambdaRankGetGradientNDCG(Context const* ctx, std::int32_t iter, linalg::VectorView ti_plus, // input bias ratio linalg::VectorView tj_minus, // input bias ratio linalg::VectorView li, linalg::VectorView lj, - HostDeviceVector* out_gpair) { + linalg::Matrix* out_gpair) { // boilerplate std::int32_t device_id = ctx->gpu_id; dh::safe_cuda(cudaSetDevice(device_id)); @@ -440,7 +441,7 @@ void LambdaRankGetGradientMAP(Context const* ctx, std::int32_t iter, linalg::VectorView ti_plus, // input bias ratio linalg::VectorView tj_minus, // input bias ratio linalg::VectorView li, linalg::VectorView lj, - HostDeviceVector* out_gpair) { + linalg::Matrix* out_gpair) { std::int32_t device_id = ctx->gpu_id; dh::safe_cuda(cudaSetDevice(device_id)); @@ -479,7 +480,7 @@ void LambdaRankGetGradientPairwise(Context const* ctx, std::int32_t iter, linalg::VectorView ti_plus, // input bias ratio linalg::VectorView tj_minus, // input bias ratio linalg::VectorView li, linalg::VectorView lj, - HostDeviceVector* out_gpair) { + linalg::Matrix* out_gpair) { std::int32_t device_id = ctx->gpu_id; dh::safe_cuda(cudaSetDevice(device_id)); diff --git a/src/objective/lambdarank_obj.cuh b/src/objective/lambdarank_obj.cuh index be9f479ce..2e5724f7f 100644 --- a/src/objective/lambdarank_obj.cuh +++ b/src/objective/lambdarank_obj.cuh @@ -61,7 +61,7 @@ struct KernelInputs { linalg::MatrixView labels; common::Span predts; - common::Span gpairs; + linalg::MatrixView gpairs; linalg::VectorView d_roundings; double const *d_cost_rounding; @@ -79,8 +79,8 @@ struct MakePairsOp { /** * \brief Make pair for the topk pair method. */ - XGBOOST_DEVICE std::tuple WithTruncation(std::size_t idx, - bst_group_t g) const { + [[nodiscard]] XGBOOST_DEVICE std::tuple WithTruncation( + std::size_t idx, bst_group_t g) const { auto thread_group_begin = args.d_threads_group_ptr[g]; auto idx_in_thread_group = idx - thread_group_begin; diff --git a/src/objective/lambdarank_obj.h b/src/objective/lambdarank_obj.h index c2222c028..f3856e3ce 100644 --- a/src/objective/lambdarank_obj.h +++ b/src/objective/lambdarank_obj.h @@ -154,7 +154,7 @@ void LambdaRankGetGradientNDCG(Context const* ctx, std::int32_t iter, linalg::VectorView t_plus, // input bias ratio linalg::VectorView t_minus, // input bias ratio linalg::VectorView li, linalg::VectorView lj, - HostDeviceVector* out_gpair); + linalg::Matrix* out_gpair); /** * \brief Generate statistic for MAP used for calculating \Delta Z in lambda mart. 
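// The hunks above replace raw GradientPair spans with linalg views: each
// query group's gradients become a linalg::VectorView sliced out of the
// (num_rows, 1) gradient matrix, addressed as g_gpair(i) rather than via
// pointer arithmetic. A minimal host-side sketch of that idiom, using only
// calls that appear in this series (Reshape, HostView, Slice, linalg::Range);
// the function name, shape, group bounds, and header paths are illustrative
// assumptions, not part of the patch:
#include <cstddef>           // std::size_t
#include "xgboost/base.h"    // xgboost::GradientPair
#include "xgboost/linalg.h"  // xgboost::linalg::Matrix, xgboost::linalg::Range

inline void SketchGroupSlice() {
  xgboost::linalg::Matrix<xgboost::GradientPair> out_gpair;
  out_gpair.Reshape(8, 1);  // (num_rows, single target)
  auto h_gpair = out_gpair.HostView();
  // One query group covering rows [2, 5); mirrors
  // h_gpair.Slice(linalg::Range(gptr[g], gptr[g + 1]), 0) in the hunks above.
  auto g_gpair = h_gpair.Slice(xgboost::linalg::Range(std::size_t{2}, std::size_t{5}), 0);
  for (std::size_t i = 0; i < g_gpair.Size(); ++i) {
    g_gpair(i) += xgboost::GradientPair{1.0f, 1.0f};  // accumulate per document
  }
}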
@@ -168,7 +168,7 @@ void LambdaRankGetGradientMAP(Context const* ctx, std::int32_t iter, linalg::VectorView t_plus, // input bias ratio linalg::VectorView t_minus, // input bias ratio linalg::VectorView li, linalg::VectorView lj, - HostDeviceVector* out_gpair); + linalg::Matrix* out_gpair); void LambdaRankGetGradientPairwise(Context const* ctx, std::int32_t iter, HostDeviceVector const& predt, const MetaInfo& info, @@ -176,7 +176,7 @@ void LambdaRankGetGradientPairwise(Context const* ctx, std::int32_t iter, linalg::VectorView ti_plus, // input bias ratio linalg::VectorView tj_minus, // input bias ratio linalg::VectorView li, linalg::VectorView lj, - HostDeviceVector* out_gpair); + linalg::Matrix* out_gpair); void LambdaRankUpdatePositionBias(Context const* ctx, linalg::VectorView li_full, linalg::VectorView lj_full, diff --git a/src/objective/multiclass_obj.cu b/src/objective/multiclass_obj.cu index 312992ec5..7c762ed48 100644 --- a/src/objective/multiclass_obj.cu +++ b/src/objective/multiclass_obj.cu @@ -1,5 +1,5 @@ -/*! - * Copyright 2015-2022 by XGBoost Contributors +/** + * Copyright 2015-2023, XGBoost Contributors * \file multi_class.cc * \brief Definition of multi-class classification objectives. * \author Tianqi Chen @@ -48,13 +48,8 @@ class SoftmaxMultiClassObj : public ObjFunction { ObjInfo Task() const override { return ObjInfo::kClassification; } - void GetGradient(const HostDeviceVector& preds, - const MetaInfo& info, - int iter, - HostDeviceVector* out_gpair) override { - // Remove unused parameter compiler warning. - (void) iter; - + void GetGradient(const HostDeviceVector& preds, const MetaInfo& info, std::int32_t, + linalg::Matrix* out_gpair) override { if (info.labels.Size() == 0) { return; } @@ -77,7 +72,7 @@ class SoftmaxMultiClassObj : public ObjFunction { label_correct_.Resize(1); label_correct_.SetDevice(device); - out_gpair->Resize(preds.Size()); + out_gpair->Reshape(info.num_row_, static_cast(nclass)); label_correct_.Fill(1); const bool is_null_weight = info.weights_.Size() == 0; @@ -115,7 +110,7 @@ class SoftmaxMultiClassObj : public ObjFunction { gpair[idx * nclass + k] = GradientPair(p * wt, h); } }, common::Range{0, ndata}, ctx_->Threads(), device) - .Eval(out_gpair, info.labels.Data(), &preds, &info.weights_, &label_correct_); + .Eval(out_gpair->Data(), info.labels.Data(), &preds, &info.weights_, &label_correct_); std::vector& label_correct_h = label_correct_.HostVector(); for (auto const flag : label_correct_h) { diff --git a/src/objective/quantile_obj.cu b/src/objective/quantile_obj.cu index f94b5edf0..0774223e7 100644 --- a/src/objective/quantile_obj.cu +++ b/src/objective/quantile_obj.cu @@ -27,13 +27,12 @@ #endif // defined(XGBOOST_USE_CUDA) -namespace xgboost { -namespace obj { +namespace xgboost::obj { class QuantileRegression : public ObjFunction { common::QuantileLossParam param_; HostDeviceVector alpha_; - bst_target_t Targets(MetaInfo const& info) const override { + [[nodiscard]] bst_target_t Targets(MetaInfo const& info) const override { auto const& alpha = param_.quantile_alpha.Get(); CHECK_EQ(alpha.size(), alpha_.Size()) << "The objective is not yet configured."; if (info.ShouldHaveLabels()) { @@ -50,7 +49,7 @@ class QuantileRegression : public ObjFunction { public: void GetGradient(HostDeviceVector const& preds, const MetaInfo& info, std::int32_t iter, - HostDeviceVector* out_gpair) override { + linalg::Matrix* out_gpair) override { if (iter == 0) { CheckInitInputs(info); } @@ -65,10 +64,11 @@ class QuantileRegression : public ObjFunction { auto 
labels = info.labels.View(ctx_->gpu_id); - out_gpair->SetDevice(ctx_->gpu_id); - out_gpair->Resize(n_targets * info.num_row_); - auto gpair = - linalg::MakeTensorView(ctx_, out_gpair, info.num_row_, n_alphas, n_targets / n_alphas); + out_gpair->SetDevice(ctx_->Device()); + CHECK_EQ(info.labels.Shape(1), 1) + << "Multi-target for quantile regression is not yet supported."; + out_gpair->Reshape(info.num_row_, n_targets); + auto gpair = out_gpair->View(ctx_->Device()); info.weights_.SetDevice(ctx_->gpu_id); common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan() @@ -85,15 +85,16 @@ class QuantileRegression : public ObjFunction { ctx_, gpair, [=] XGBOOST_DEVICE(std::size_t i, GradientPair const&) mutable { auto [sample_id, quantile_id, target_id] = linalg::UnravelIndex(i, n_samples, alpha.size(), n_targets / alpha.size()); + assert(target_id == 0); auto d = predt(i) - labels(sample_id, target_id); auto h = weight[sample_id]; if (d >= 0) { auto g = (1.0f - alpha[quantile_id]) * weight[sample_id]; - gpair(sample_id, quantile_id, target_id) = GradientPair{g, h}; + gpair(sample_id, quantile_id) = GradientPair{g, h}; } else { auto g = (-alpha[quantile_id] * weight[sample_id]); - gpair(sample_id, quantile_id, target_id) = GradientPair{g, h}; + gpair(sample_id, quantile_id) = GradientPair{g, h}; } }); } @@ -192,7 +193,7 @@ class QuantileRegression : public ObjFunction { param_.Validate(); this->alpha_.HostVector() = param_.quantile_alpha.Get(); } - ObjInfo Task() const override { return {ObjInfo::kRegression, true, true}; } + [[nodiscard]] ObjInfo Task() const override { return {ObjInfo::kRegression, true, true}; } static char const* Name() { return "reg:quantileerror"; } void SaveConfig(Json* p_out) const override { @@ -206,8 +207,8 @@ class QuantileRegression : public ObjFunction { alpha_.HostVector() = param_.quantile_alpha.Get(); } - const char* DefaultEvalMetric() const override { return "quantile"; } - Json DefaultMetricConfig() const override { + [[nodiscard]] const char* DefaultEvalMetric() const override { return "quantile"; } + [[nodiscard]] Json DefaultMetricConfig() const override { CHECK(param_.GetInitialised()); Json config{Object{}}; config["name"] = String{this->DefaultEvalMetric()}; @@ -223,5 +224,4 @@ XGBOOST_REGISTER_OBJECTIVE(QuantileRegression, QuantileRegression::Name()) #if defined(XGBOOST_USE_CUDA) DMLC_REGISTRY_FILE_TAG(quantile_obj_gpu); #endif // defined(XGBOOST_USE_CUDA) -} // namespace obj -} // namespace xgboost +} // namespace xgboost::obj diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu index 4c5ed9ec8..5751d6102 100644 --- a/src/objective/regression_obj.cu +++ b/src/objective/regression_obj.cu @@ -36,12 +36,12 @@ #include "xgboost/tree_model.h" // RegTree #if defined(XGBOOST_USE_CUDA) +#include "../common/cuda_context.cuh" // for CUDAContext #include "../common/device_helpers.cuh" #include "../common/linalg_op.cuh" #endif // defined(XGBOOST_USE_CUDA) -namespace xgboost { -namespace obj { +namespace xgboost::obj { namespace { void CheckRegInputs(MetaInfo const& info, HostDeviceVector const& preds) { CheckInitInputs(info); @@ -68,33 +68,60 @@ class RegLossObj : public FitIntercept { HostDeviceVector additional_input_; public: - // 0 - label_correct flag, 1 - scale_pos_weight, 2 - is_null_weight - RegLossObj(): additional_input_(3) {} + void ValidateLabel(MetaInfo const& info) { + auto label = info.labels.View(ctx_->Ordinal()); + auto valid = ctx_->DispatchDevice( + [&] { + return std::all_of(linalg::cbegin(label), 
linalg::cend(label), + [](float y) -> bool { return Loss::CheckLabel(y); }); + }, + [&] { +#if defined(XGBOOST_USE_CUDA) + auto cuctx = ctx_->CUDACtx(); + auto it = dh::MakeTransformIterator( + thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) -> bool { + auto [m, n] = linalg::UnravelIndex(i, label.Shape()); + return Loss::CheckLabel(label(m, n)); + }); + return dh::Reduce(cuctx->CTP(), it, it + label.Size(), true, thrust::logical_and<>{}); +#else + common::AssertGPUSupport(); + return false; +#endif // defined(XGBOOST_USE_CUDA) + }); + if (!valid) { + LOG(FATAL) << Loss::LabelErrorMsg(); + } + } + // 0 - scale_pos_weight, 1 - is_null_weight + RegLossObj(): additional_input_(2) {} void Configure(const std::vector >& args) override { param_.UpdateAllowUnknown(args); } - ObjInfo Task() const override { return Loss::Info(); } + [[nodiscard]] ObjInfo Task() const override { return Loss::Info(); } - bst_target_t Targets(MetaInfo const& info) const override { + [[nodiscard]] bst_target_t Targets(MetaInfo const& info) const override { // Multi-target regression. - return std::max(static_cast(1), info.labels.Shape(1)); + return std::max(static_cast(1), info.labels.Shape(1)); } - void GetGradient(const HostDeviceVector& preds, - const MetaInfo &info, int, - HostDeviceVector* out_gpair) override { + void GetGradient(const HostDeviceVector& preds, const MetaInfo& info, + std::int32_t iter, linalg::Matrix* out_gpair) override { CheckRegInputs(info, preds); + if (iter == 0) { + ValidateLabel(info); + } + size_t const ndata = preds.Size(); - out_gpair->Resize(ndata); + out_gpair->SetDevice(ctx_->Device()); auto device = ctx_->gpu_id; - additional_input_.HostVector().begin()[0] = 1; // Fill the label_correct flag bool is_null_weight = info.weights_.Size() == 0; auto scale_pos_weight = param_.scale_pos_weight; - additional_input_.HostVector().begin()[1] = scale_pos_weight; - additional_input_.HostVector().begin()[2] = is_null_weight; + additional_input_.HostVector().begin()[0] = scale_pos_weight; + additional_input_.HostVector().begin()[1] = is_null_weight; const size_t nthreads = ctx_->Threads(); bool on_device = device >= 0; @@ -102,7 +129,8 @@ class RegLossObj : public FitIntercept { // for better performance. const size_t n_data_blocks = std::max(static_cast(1), (on_device ? ndata : nthreads)); const size_t block_size = ndata / n_data_blocks + !!(ndata % n_data_blocks); - auto const n_targets = std::max(info.labels.Shape(1), static_cast(1)); + auto const n_targets = this->Targets(info); + out_gpair->Reshape(info.num_row_, n_targets); common::Transform<>::Init( [block_size, ndata, n_targets] XGBOOST_DEVICE( @@ -117,8 +145,8 @@ class RegLossObj : public FitIntercept { GradientPair* out_gpair_ptr = _out_gpair.data(); const size_t begin = data_block_idx*block_size; const size_t end = std::min(ndata, begin + block_size); - const float _scale_pos_weight = _additional_input[1]; - const bool _is_null_weight = _additional_input[2]; + const float _scale_pos_weight = _additional_input[0]; + const bool _is_null_weight = _additional_input[1]; for (size_t idx = begin; idx < end; ++idx) { bst_float p = Loss::PredTransform(preds_ptr[idx]); @@ -127,26 +155,17 @@ class RegLossObj : public FitIntercept { if (label == 1.0f) { w *= _scale_pos_weight; } - if (!Loss::CheckLabel(label)) { - // If there is an incorrect label, the host code will know. 
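// NOTE: the per-element flag that used to live in _additional_input[0] is
// superseded by ValidateLabel above, which walks the label matrix once at
// iter == 0: std::all_of over linalg::cbegin/cend on the CPU branch, and a
// thrust::logical_and reduction on the CUDA branch. A condensed sketch of
// the CPU branch, with ToyLoss standing in for the Loss policy (both names
// here are illustrative, not part of the patch):
//
//   struct ToyLoss {
//     static bool CheckLabel(float y) { return y >= 0.0f && y <= 1.0f; }
//   };
//   template <typename Loss, typename View>
//   bool LabelsValid(View label) {
//     return std::all_of(linalg::cbegin(label), linalg::cend(label),
//                        [](float y) -> bool { return Loss::CheckLabel(y); });
//   }
//
// A failed check still ends in LOG(FATAL) << Loss::LabelErrorMsg(), so the
// user-visible error is unchanged; only its cost moves out of the hot loop.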
- _additional_input[0] = 0; - } out_gpair_ptr[idx] = GradientPair(Loss::FirstOrderGradient(p, label) * w, Loss::SecondOrderGradient(p, label) * w); } }, common::Range{0, static_cast(n_data_blocks)}, nthreads, device) - .Eval(&additional_input_, out_gpair, &preds, info.labels.Data(), + .Eval(&additional_input_, out_gpair->Data(), &preds, info.labels.Data(), &info.weights_); - - auto const flag = additional_input_.HostVector().begin()[0]; - if (flag == 0) { - LOG(FATAL) << Loss::LabelErrorMsg(); - } } public: - const char* DefaultEvalMetric() const override { + [[nodiscard]] const char* DefaultEvalMetric() const override { return Loss::DefaultEvalMetric(); } @@ -160,7 +179,7 @@ class RegLossObj : public FitIntercept { .Eval(io_preds); } - float ProbToMargin(float base_score) const override { + [[nodiscard]] float ProbToMargin(float base_score) const override { return Loss::ProbToMargin(base_score); } @@ -215,21 +234,21 @@ class PseudoHuberRegression : public FitIntercept { public: void Configure(Args const& args) override { param_.UpdateAllowUnknown(args); } - ObjInfo Task() const override { return ObjInfo::kRegression; } - bst_target_t Targets(MetaInfo const& info) const override { - return std::max(static_cast(1), info.labels.Shape(1)); + [[nodiscard]] ObjInfo Task() const override { return ObjInfo::kRegression; } + [[nodiscard]] bst_target_t Targets(MetaInfo const& info) const override { + return std::max(static_cast(1), info.labels.Shape(1)); } void GetGradient(HostDeviceVector const& preds, const MetaInfo& info, int /*iter*/, - HostDeviceVector* out_gpair) override { + linalg::Matrix* out_gpair) override { CheckRegInputs(info, preds); auto slope = param_.huber_slope; CHECK_NE(slope, 0.0) << "slope for pseudo huber cannot be 0."; auto labels = info.labels.View(ctx_->gpu_id); out_gpair->SetDevice(ctx_->gpu_id); - out_gpair->Resize(info.labels.Size()); - auto gpair = linalg::MakeVec(out_gpair); + out_gpair->Reshape(info.num_row_, this->Targets(info)); + auto gpair = out_gpair->View(ctx_->Device()); preds.SetDevice(ctx_->gpu_id); auto predt = linalg::MakeVec(&preds); @@ -252,7 +271,7 @@ class PseudoHuberRegression : public FitIntercept { }); } - const char* DefaultEvalMetric() const override { return "mphe"; } + [[nodiscard]] const char* DefaultEvalMetric() const override { return "mphe"; } void SaveConfig(Json* p_out) const override { auto& out = *p_out; @@ -292,15 +311,15 @@ class PoissonRegression : public FitIntercept { param_.UpdateAllowUnknown(args); } - ObjInfo Task() const override { return ObjInfo::kRegression; } + [[nodiscard]] ObjInfo Task() const override { return ObjInfo::kRegression; } - void GetGradient(const HostDeviceVector& preds, - const MetaInfo &info, int, - HostDeviceVector *out_gpair) override { + void GetGradient(const HostDeviceVector& preds, const MetaInfo& info, int, + linalg::Matrix* out_gpair) override { CHECK_NE(info.labels.Size(), 0U) << "label set cannot be empty"; CHECK_EQ(preds.Size(), info.labels.Size()) << "labels are not correctly provided"; size_t const ndata = preds.Size(); - out_gpair->Resize(ndata); + out_gpair->SetDevice(ctx_->Device()); + out_gpair->Reshape(info.num_row_, this->Targets(info)); auto device = ctx_->gpu_id; label_correct_.Resize(1); label_correct_.Fill(1); @@ -328,7 +347,7 @@ class PoissonRegression : public FitIntercept { expf(p + max_delta_step) * w}; }, common::Range{0, static_cast(ndata)}, this->ctx_->Threads(), device).Eval( - &label_correct_, out_gpair, &preds, info.labels.Data(), &info.weights_); + &label_correct_, 
out_gpair->Data(), &preds, info.labels.Data(), &info.weights_); // copy "label correct" flags back to host std::vector& label_correct_h = label_correct_.HostVector(); for (auto const flag : label_correct_h) { @@ -349,10 +368,10 @@ class PoissonRegression : public FitIntercept { void EvalTransform(HostDeviceVector *io_preds) override { PredTransform(io_preds); } - bst_float ProbToMargin(bst_float base_score) const override { + [[nodiscard]] float ProbToMargin(bst_float base_score) const override { return std::log(base_score); } - const char* DefaultEvalMetric() const override { + [[nodiscard]] const char* DefaultEvalMetric() const override { return "poisson-nloglik"; } @@ -383,16 +402,15 @@ XGBOOST_REGISTER_OBJECTIVE(PoissonRegression, "count:poisson") class CoxRegression : public FitIntercept { public: void Configure(Args const&) override {} - ObjInfo Task() const override { return ObjInfo::kRegression; } + [[nodiscard]] ObjInfo Task() const override { return ObjInfo::kRegression; } - void GetGradient(const HostDeviceVector& preds, - const MetaInfo &info, int, - HostDeviceVector *out_gpair) override { + void GetGradient(const HostDeviceVector& preds, const MetaInfo& info, int, + linalg::Matrix* out_gpair) override { CHECK_NE(info.labels.Size(), 0U) << "label set cannot be empty"; CHECK_EQ(preds.Size(), info.labels.Size()) << "labels are not correctly provided"; const auto& preds_h = preds.HostVector(); - out_gpair->Resize(preds_h.size()); - auto& gpair = out_gpair->HostVector(); + out_gpair->Reshape(info.num_row_, this->Targets(info)); + auto gpair = out_gpair->HostView(); const std::vector &label_order = info.LabelAbsSort(ctx_); const omp_ulong ndata = static_cast(preds_h.size()); // NOLINT(*) @@ -440,8 +458,8 @@ class CoxRegression : public FitIntercept { } const double grad = exp_p*r_k - static_cast(y > 0); - const double hess = exp_p*r_k - exp_p*exp_p * s_k; - gpair.at(ind) = GradientPair(grad * w, hess * w); + const double hess = exp_p * r_k - exp_p * exp_p * s_k; + gpair(ind) = GradientPair(grad * w, hess * w); last_abs_y = abs_y; last_exp_p = exp_p; @@ -457,10 +475,10 @@ class CoxRegression : public FitIntercept { void EvalTransform(HostDeviceVector *io_preds) override { PredTransform(io_preds); } - bst_float ProbToMargin(bst_float base_score) const override { + [[nodiscard]] float ProbToMargin(bst_float base_score) const override { return std::log(base_score); } - const char* DefaultEvalMetric() const override { + [[nodiscard]] const char* DefaultEvalMetric() const override { return "cox-nloglik"; } @@ -480,16 +498,16 @@ XGBOOST_REGISTER_OBJECTIVE(CoxRegression, "survival:cox") class GammaRegression : public FitIntercept { public: void Configure(Args const&) override {} - ObjInfo Task() const override { return ObjInfo::kRegression; } + [[nodiscard]] ObjInfo Task() const override { return ObjInfo::kRegression; } - void GetGradient(const HostDeviceVector &preds, - const MetaInfo &info, int, - HostDeviceVector *out_gpair) override { + void GetGradient(const HostDeviceVector& preds, const MetaInfo& info, std::int32_t, + linalg::Matrix* out_gpair) override { CHECK_NE(info.labels.Size(), 0U) << "label set cannot be empty"; CHECK_EQ(preds.Size(), info.labels.Size()) << "labels are not correctly provided"; const size_t ndata = preds.Size(); auto device = ctx_->gpu_id; - out_gpair->Resize(ndata); + out_gpair->SetDevice(ctx_->Device()); + out_gpair->Reshape(info.num_row_, this->Targets(info)); label_correct_.Resize(1); label_correct_.Fill(1); @@ -514,7 +532,7 @@ class GammaRegression : 
public FitIntercept { _out_gpair[_idx] = GradientPair((1 - y / expf(p)) * w, y / expf(p) * w); }, common::Range{0, static_cast(ndata)}, this->ctx_->Threads(), device).Eval( - &label_correct_, out_gpair, &preds, info.labels.Data(), &info.weights_); + &label_correct_, out_gpair->Data(), &preds, info.labels.Data(), &info.weights_); // copy "label correct" flags back to host std::vector& label_correct_h = label_correct_.HostVector(); @@ -536,10 +554,10 @@ class GammaRegression : public FitIntercept { void EvalTransform(HostDeviceVector *io_preds) override { PredTransform(io_preds); } - bst_float ProbToMargin(bst_float base_score) const override { + [[nodiscard]] float ProbToMargin(bst_float base_score) const override { return std::log(base_score); } - const char* DefaultEvalMetric() const override { + [[nodiscard]] const char* DefaultEvalMetric() const override { return "gamma-nloglik"; } void SaveConfig(Json* p_out) const override { @@ -578,15 +596,15 @@ class TweedieRegression : public FitIntercept { metric_ = os.str(); } - ObjInfo Task() const override { return ObjInfo::kRegression; } + [[nodiscard]] ObjInfo Task() const override { return ObjInfo::kRegression; } - void GetGradient(const HostDeviceVector& preds, - const MetaInfo &info, int, - HostDeviceVector *out_gpair) override { + void GetGradient(const HostDeviceVector& preds, const MetaInfo& info, std::int32_t, + linalg::Matrix* out_gpair) override { CHECK_NE(info.labels.Size(), 0U) << "label set cannot be empty"; CHECK_EQ(preds.Size(), info.labels.Size()) << "labels are not correctly provided"; const size_t ndata = preds.Size(); - out_gpair->Resize(ndata); + out_gpair->SetDevice(ctx_->Device()); + out_gpair->Reshape(info.num_row_, this->Targets(info)); auto device = ctx_->gpu_id; label_correct_.Resize(1); @@ -619,7 +637,7 @@ class TweedieRegression : public FitIntercept { _out_gpair[_idx] = GradientPair(grad * w, hess * w); }, common::Range{0, static_cast(ndata), 1}, this->ctx_->Threads(), device) - .Eval(&label_correct_, out_gpair, &preds, info.labels.Data(), &info.weights_); + .Eval(&label_correct_, out_gpair->Data(), &preds, info.labels.Data(), &info.weights_); // copy "label correct" flags back to host std::vector& label_correct_h = label_correct_.HostVector(); @@ -639,11 +657,11 @@ class TweedieRegression : public FitIntercept { .Eval(io_preds); } - bst_float ProbToMargin(bst_float base_score) const override { + [[nodiscard]] float ProbToMargin(bst_float base_score) const override { return std::log(base_score); } - const char* DefaultEvalMetric() const override { + [[nodiscard]] const char* DefaultEvalMetric() const override { return metric_.c_str(); } @@ -672,19 +690,19 @@ XGBOOST_REGISTER_OBJECTIVE(TweedieRegression, "reg:tweedie") class MeanAbsoluteError : public ObjFunction { public: void Configure(Args const&) override {} - ObjInfo Task() const override { return {ObjInfo::kRegression, true, true}; } - bst_target_t Targets(MetaInfo const& info) const override { - return std::max(static_cast(1), info.labels.Shape(1)); + [[nodiscard]] ObjInfo Task() const override { return {ObjInfo::kRegression, true, true}; } + [[nodiscard]] bst_target_t Targets(MetaInfo const& info) const override { + return std::max(static_cast(1), info.labels.Shape(1)); } - void GetGradient(HostDeviceVector const& preds, const MetaInfo& info, int /*iter*/, - HostDeviceVector* out_gpair) override { + void GetGradient(HostDeviceVector const& preds, const MetaInfo& info, + std::int32_t /*iter*/, linalg::Matrix* out_gpair) override { CheckRegInputs(info, preds); 
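// NOTE: the allocation sequence here is the pattern every objective in this
// file now follows: pick the device first, then shape the gradient matrix as
// (num_row, num_targets) rather than resizing a flat vector. Condensed:
//
//   out_gpair->SetDevice(ctx_->Device());
//   out_gpair->Reshape(info.num_row_, this->Targets(info));
//   auto gpair = out_gpair->View(ctx_->Device());  // device or host view
//
// Element access becomes gpair(sample_id, target_id), which is what lets the
// unravelled multi-target indexing below write both dimensions directly.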
auto labels = info.labels.View(ctx_->gpu_id); - out_gpair->SetDevice(ctx_->gpu_id); - out_gpair->Resize(info.labels.Size()); - auto gpair = linalg::MakeVec(out_gpair); + out_gpair->SetDevice(ctx_->Device()); + out_gpair->Reshape(info.num_row_, this->Targets(info)); + auto gpair = out_gpair->View(ctx_->Device()); preds.SetDevice(ctx_->gpu_id); auto predt = linalg::MakeVec(&preds); @@ -692,14 +710,14 @@ class MeanAbsoluteError : public ObjFunction { common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan() : info.weights_.ConstDeviceSpan()}; - linalg::ElementWiseKernel(ctx_, labels, [=] XGBOOST_DEVICE(size_t i, float const y) mutable { + linalg::ElementWiseKernel(ctx_, labels, [=] XGBOOST_DEVICE(std::size_t i, float y) mutable { auto sign = [](auto x) { return (x > static_cast(0)) - (x < static_cast(0)); }; - auto sample_id = std::get<0>(linalg::UnravelIndex(i, labels.Shape())); + auto [sample_id, target_id] = linalg::UnravelIndex(i, labels.Shape()); auto grad = sign(predt(i) - y) * weight[sample_id]; auto hess = weight[sample_id]; - gpair(i) = GradientPair{grad, hess}; + gpair(sample_id, target_id) = GradientPair{grad, hess}; }); } @@ -748,7 +766,7 @@ class MeanAbsoluteError : public ObjFunction { p_tree); } - const char* DefaultEvalMetric() const override { return "mae"; } + [[nodiscard]] const char* DefaultEvalMetric() const override { return "mae"; } void SaveConfig(Json* p_out) const override { auto& out = *p_out; @@ -763,5 +781,4 @@ class MeanAbsoluteError : public ObjFunction { XGBOOST_REGISTER_OBJECTIVE(MeanAbsoluteError, "reg:absoluteerror") .describe("Mean absoluate error.") .set_body([]() { return new MeanAbsoluteError(); }); -} // namespace obj -} // namespace xgboost +} // namespace xgboost::obj diff --git a/src/tree/fit_stump.cc b/src/tree/fit_stump.cc index 3533de772..ec654a1b2 100644 --- a/src/tree/fit_stump.cc +++ b/src/tree/fit_stump.cc @@ -66,14 +66,13 @@ inline void FitStump(Context const*, linalg::TensorView, #endif // !defined(XGBOOST_USE_CUDA) } // namespace cuda_impl -void FitStump(Context const* ctx, MetaInfo const& info, HostDeviceVector const& gpair, +void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix const& gpair, bst_target_t n_targets, linalg::Vector* out) { out->SetDevice(ctx->gpu_id); out->Reshape(n_targets); - auto n_samples = gpair.Size() / n_targets; - gpair.SetDevice(ctx->gpu_id); - auto gpair_t = linalg::MakeTensorView(ctx, &gpair, n_samples, n_targets); + gpair.SetDevice(ctx->Device()); + auto gpair_t = gpair.View(ctx->Device()); ctx->IsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView()) : cuda_impl::FitStump(ctx, gpair_t, out->View(ctx->gpu_id)); } diff --git a/src/tree/fit_stump.h b/src/tree/fit_stump.h index 4778ecfc5..2af779f77 100644 --- a/src/tree/fit_stump.h +++ b/src/tree/fit_stump.h @@ -31,7 +31,7 @@ XGBOOST_DEVICE inline double CalcUnregularizedWeight(T sum_grad, T sum_hess) { /** * @brief Fit a tree stump as an estimation of base_score. 
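 *
 * A call sketch under the new matrix-based signature; the element types and
 * the (rows x targets) shape convention are assumptions recovered from the
 * surrounding hunks:
 *
 * @code{.cc}
 *   linalg::Matrix<GradientPair> gpair{{info.num_row_, n_targets}, ctx->Device()};
 *   linalg::Vector<float> base_score;
 *   FitStump(ctx, info, gpair, n_targets, &base_score);  // one leaf weight per target
 * @endcode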
*/ -void FitStump(Context const* ctx, MetaInfo const& info, HostDeviceVector const& gpair, +void FitStump(Context const* ctx, MetaInfo const& info, linalg::Matrix const& gpair, bst_target_t n_targets, linalg::Vector* out); } // namespace tree } // namespace xgboost diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc index 2110cd6e6..17e020ced 100644 --- a/src/tree/updater_approx.cc +++ b/src/tree/updater_approx.cc @@ -269,17 +269,18 @@ class GlobalApproxUpdater : public TreeUpdater { out["hist_train_param"] = ToJson(hist_param_); } - void InitData(TrainParam const ¶m, HostDeviceVector const *gpair, + void InitData(TrainParam const ¶m, linalg::Matrix const *gpair, linalg::Matrix *sampled) { *sampled = linalg::Empty(ctx_, gpair->Size(), 1); - sampled->Data()->Copy(*gpair); + auto in = gpair->HostView().Values(); + std::copy(in.data(), in.data() + in.size(), sampled->HostView().Values().data()); SampleGradient(ctx_, param, sampled->HostView()); } [[nodiscard]] char const *Name() const override { return "grow_histmaker"; } - void Update(TrainParam const *param, HostDeviceVector *gpair, DMatrix *m, + void Update(TrainParam const *param, linalg::Matrix *gpair, DMatrix *m, common::Span> out_position, const std::vector &trees) override { CHECK(hist_param_.GetInitialised()); diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc index bda9b4dfa..3afbe3e46 100644 --- a/src/tree/updater_colmaker.cc +++ b/src/tree/updater_colmaker.cc @@ -91,7 +91,7 @@ class ColMaker: public TreeUpdater { } } - void Update(TrainParam const *param, HostDeviceVector *gpair, DMatrix *dmat, + void Update(TrainParam const *param, linalg::Matrix *gpair, DMatrix *dmat, common::Span> /*out_position*/, const std::vector &trees) override { if (collective::IsDistributed()) { @@ -106,10 +106,11 @@ class ColMaker: public TreeUpdater { // rescale learning rate according to size of trees interaction_constraints_.Configure(*param, dmat->Info().num_row_); // build tree + CHECK_EQ(gpair->Shape(1), 1) << MTNotImplemented(); for (auto tree : trees) { CHECK(ctx_); Builder builder(*param, colmaker_param_, interaction_constraints_, ctx_, column_densities_); - builder.Update(gpair->ConstHostVector(), dmat, tree); + builder.Update(gpair->Data()->ConstHostVector(), dmat, tree); } } diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 9e94c46c6..33dfbf8c5 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -760,16 +760,18 @@ class GPUHistMaker : public TreeUpdater { dh::GlobalMemoryLogger().Log(); } - void Update(TrainParam const* param, HostDeviceVector* gpair, DMatrix* dmat, + void Update(TrainParam const* param, linalg::Matrix* gpair, DMatrix* dmat, common::Span> out_position, const std::vector& trees) override { monitor_.Start("Update"); + CHECK_EQ(gpair->Shape(1), 1) << MTNotImplemented(); + auto gpair_hdv = gpair->Data(); // build tree try { std::size_t t_idx{0}; for (xgboost::RegTree* tree : trees) { - this->UpdateTree(param, gpair, dmat, tree, &out_position[t_idx]); + this->UpdateTree(param, gpair_hdv, dmat, tree, &out_position[t_idx]); this->hist_maker_param_.CheckTreesSynchronized(tree); ++t_idx; } @@ -887,7 +889,7 @@ class GPUGlobalApproxMaker : public TreeUpdater { } ~GPUGlobalApproxMaker() override { dh::GlobalMemoryLogger().Log(); } - void Update(TrainParam const* param, HostDeviceVector* gpair, DMatrix* p_fmat, + void Update(TrainParam const* param, linalg::Matrix* gpair, DMatrix* p_fmat, common::Span> out_position, const std::vector& trees) 
override { monitor_.Start("Update"); @@ -898,7 +900,7 @@ class GPUGlobalApproxMaker : public TreeUpdater { auto hess = dh::ToSpan(hess_); gpair->SetDevice(ctx_->Device()); - auto d_gpair = gpair->ConstDeviceSpan(); + auto d_gpair = gpair->Data()->ConstDeviceSpan(); auto cuctx = ctx_->CUDACtx(); thrust::transform(cuctx->CTP(), dh::tcbegin(d_gpair), dh::tcend(d_gpair), dh::tbegin(hess), [=] XGBOOST_DEVICE(GradientPair const& g) { return g.GetHess(); }); @@ -912,7 +914,7 @@ class GPUGlobalApproxMaker : public TreeUpdater { std::size_t t_idx{0}; for (xgboost::RegTree* tree : trees) { - this->UpdateTree(gpair, p_fmat, tree, &out_position[t_idx]); + this->UpdateTree(gpair->Data(), p_fmat, tree, &out_position[t_idx]); this->hist_maker_param_.CheckTreesSynchronized(tree); ++t_idx; } diff --git a/src/tree/updater_prune.cc b/src/tree/updater_prune.cc index 29f9917ba..2c2d1a2f0 100644 --- a/src/tree/updater_prune.cc +++ b/src/tree/updater_prune.cc @@ -31,7 +31,7 @@ class TreePruner : public TreeUpdater { [[nodiscard]] bool CanModifyTree() const override { return true; } // update the tree, do pruning - void Update(TrainParam const* param, HostDeviceVector* gpair, DMatrix* p_fmat, + void Update(TrainParam const* param, linalg::Matrix* gpair, DMatrix* p_fmat, common::Span> out_position, const std::vector& trees) override { pruner_monitor_.Start("PrunerUpdate"); diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 883c18f36..34890c2e5 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -492,7 +492,7 @@ class QuantileHistMaker : public TreeUpdater { [[nodiscard]] char const *Name() const override { return "grow_quantile_histmaker"; } - void Update(TrainParam const *param, HostDeviceVector *gpair, DMatrix *p_fmat, + void Update(TrainParam const *param, linalg::Matrix *gpair, DMatrix *p_fmat, common::Span> out_position, const std::vector &trees) override { if (trees.front()->IsMultiTarget()) { @@ -511,8 +511,7 @@ class QuantileHistMaker : public TreeUpdater { } bst_target_t n_targets = trees.front()->NumTargets(); - auto h_gpair = - linalg::MakeTensorView(ctx_, gpair->HostSpan(), p_fmat->Info().num_row_, n_targets); + auto h_gpair = gpair->HostView(); linalg::Matrix sample_out; auto h_sample_out = h_gpair; diff --git a/src/tree/updater_refresh.cc b/src/tree/updater_refresh.cc index 2bfd3c8de..941df7aec 100644 --- a/src/tree/updater_refresh.cc +++ b/src/tree/updater_refresh.cc @@ -31,11 +31,14 @@ class TreeRefresher : public TreeUpdater { [[nodiscard]] char const *Name() const override { return "refresh"; } [[nodiscard]] bool CanModifyTree() const override { return true; } // update the tree, do pruning - void Update(TrainParam const *param, HostDeviceVector *gpair, DMatrix *p_fmat, + void Update(TrainParam const *param, linalg::Matrix *gpair, DMatrix *p_fmat, common::Span> /*out_position*/, const std::vector &trees) override { - if (trees.size() == 0) return; - const std::vector &gpair_h = gpair->ConstHostVector(); + if (trees.size() == 0) { + return; + } + CHECK_EQ(gpair->Shape(1), 1) << MTNotImplemented(); + const std::vector &gpair_h = gpair->Data()->ConstHostVector(); // thread temporal space std::vector > stemp; std::vector fvec_temp; diff --git a/src/tree/updater_sync.cc b/src/tree/updater_sync.cc index 2422807e2..f64f35483 100644 --- a/src/tree/updater_sync.cc +++ b/src/tree/updater_sync.cc @@ -31,7 +31,7 @@ class TreeSyncher : public TreeUpdater { [[nodiscard]] char const* Name() const override { return "prune"; } - void 
Update(TrainParam const*, HostDeviceVector*, DMatrix*, + void Update(TrainParam const*, linalg::Matrix*, DMatrix*, common::Span> /*out_position*/, const std::vector& trees) override { if (collective::GetWorldSize() == 1) return; diff --git a/tests/cpp/c_api/test_c_api.cc b/tests/cpp/c_api/test_c_api.cc index 3bf03c955..7fcab199e 100644 --- a/tests/cpp/c_api/test_c_api.cc +++ b/tests/cpp/c_api/test_c_api.cc @@ -565,7 +565,7 @@ void TestXGDMatrixGetQuantileCut(Context const *ctx) { ASSERT_EQ(XGBoosterCreate(mats.data(), 1, &booster), 0); ASSERT_EQ(XGBoosterSetParam(booster, "max_bin", "16"), 0); if (ctx->IsCUDA()) { - ASSERT_EQ(XGBoosterSetParam(booster, "tree_method", "gpu_hist"), 0); + ASSERT_EQ(XGBoosterSetParam(booster, "device", ctx->DeviceName().c_str()), 0); } ASSERT_EQ(XGBoosterUpdateOneIter(booster, 0, p_fmat), 0); ASSERT_EQ(XGDMatrixGetQuantileCut(p_fmat, s_config.c_str(), &out_indptr, &out_data), 0); @@ -596,7 +596,7 @@ void TestXGDMatrixGetQuantileCut(Context const *ctx) { ASSERT_EQ(XGBoosterCreate(mats.data(), 1, &booster), 0); ASSERT_EQ(XGBoosterSetParam(booster, "max_bin", "16"), 0); if (ctx->IsCUDA()) { - ASSERT_EQ(XGBoosterSetParam(booster, "tree_method", "gpu_hist"), 0); + ASSERT_EQ(XGBoosterSetParam(booster, "device", ctx->DeviceName().c_str()), 0); } ASSERT_EQ(XGBoosterUpdateOneIter(booster, 0, p_fmat), 0); ASSERT_EQ(XGDMatrixGetQuantileCut(p_fmat, s_config.c_str(), &out_indptr, &out_data), 0); diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc index 9e6311701..d7b7e588d 100644 --- a/tests/cpp/gbm/test_gbtree.cc +++ b/tests/cpp/gbm/test_gbtree.cc @@ -65,7 +65,9 @@ TEST(GBTree, PredictionCache) { gbtree.Configure({{"tree_method", "hist"}}); auto p_m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); - auto gpair = GenerateRandomGradients(kRows); + linalg::Matrix gpair({kRows}, ctx.Ordinal()); + gpair.Data()->Copy(GenerateRandomGradients(kRows)); + PredictionCacheEntry out_predictions; gbtree.DoBoost(p_m.get(), &gpair, &out_predictions, nullptr); @@ -213,7 +215,8 @@ TEST(GBTree, ChooseTreeMethod) { } learner->Configure(); for (std::int32_t i = 0; i < 3; ++i) { - HostDeviceVector gpair{GenerateRandomGradients(Xy->Info().num_row_)}; + linalg::Matrix gpair{{Xy->Info().num_row_}, Context::kCpuId}; + gpair.Data()->Copy(GenerateRandomGradients(Xy->Info().num_row_)); learner->BoostOneIter(0, Xy, &gpair); } diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 111c7b30e..a9ff347ea 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -96,9 +96,9 @@ void CheckObjFunctionImpl(std::unique_ptr const& obj, std::vector out_grad, std::vector out_hess) { xgboost::HostDeviceVector in_preds(preds); - xgboost::HostDeviceVector out_gpair; - obj->GetGradient(in_preds, info, 1, &out_gpair); - std::vector& gpair = out_gpair.HostVector(); + xgboost::linalg::Matrix out_gpair; + obj->GetGradient(in_preds, info, 0, &out_gpair); + std::vector& gpair = out_gpair.Data()->HostVector(); ASSERT_EQ(gpair.size(), in_preds.Size()); for (int i = 0; i < static_cast(gpair.size()); ++i) { @@ -119,8 +119,8 @@ void CheckObjFunction(std::unique_ptr const& obj, std::vector out_hess) { xgboost::MetaInfo info; info.num_row_ = labels.size(); - info.labels = - xgboost::linalg::Tensor{labels.cbegin(), labels.cend(), {labels.size()}, -1}; + info.labels = xgboost::linalg::Tensor{ + labels.cbegin(), labels.cend(), {labels.size(), static_cast(1)}, -1}; info.weights_.HostVector() = weights; CheckObjFunctionImpl(obj, preds, labels, weights, info, out_grad, out_hess); @@ -155,8 
+155,8 @@ void CheckRankingObjFunction(std::unique_ptr const& obj, std::vector out_hess) { xgboost::MetaInfo info; info.num_row_ = labels.size(); - info.labels = xgboost::linalg::Tensor{ - labels.cbegin(), labels.cend(), {labels.size(), static_cast(1)}, -1}; + info.labels = xgboost::linalg::Matrix{ + labels.cbegin(), labels.cend(), {labels.size(), static_cast(1)}, -1}; info.weights_.HostVector() = weights; info.group_ptr_ = groups; @@ -645,11 +645,10 @@ std::unique_ptr CreateTrainedGBM(std::string name, Args kwargs, } p_dmat->Info().labels = linalg::Tensor{labels.cbegin(), labels.cend(), {labels.size()}, -1}; - HostDeviceVector gpair; - auto& h_gpair = gpair.HostVector(); - h_gpair.resize(kRows); + linalg::Matrix gpair({kRows}, ctx->Ordinal()); + auto h_gpair = gpair.HostView(); for (size_t i = 0; i < kRows; ++i) { - h_gpair[i] = GradientPair{static_cast(i), 1}; + h_gpair(i) = GradientPair{static_cast(i), 1}; } PredictionCacheEntry predts; diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index e39375dfa..bad15c695 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -387,23 +387,6 @@ std::unique_ptr CreateTrainedGBM(std::string name, Args kwargs, LearnerModelParam const* learner_model_param, Context const* generic_param); -inline std::unique_ptr> GenerateGradients( - std::size_t rows, bst_target_t n_targets = 1) { - auto p_gradients = std::make_unique>(rows * n_targets); - auto& h_gradients = p_gradients->HostVector(); - - xgboost::SimpleLCG gen; - xgboost::SimpleRealUniformDistribution dist(0.0f, 1.0f); - - for (std::size_t i = 0; i < rows * n_targets; ++i) { - auto grad = dist(&gen); - auto hess = dist(&gen); - h_gradients[i] = GradientPair{grad, hess}; - } - - return p_gradients; -} - /** * \brief Make a context that uses CUDA if device >= 0. 
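 *
 * Usage sketch (the ordinals are illustrative; a negative ordinal falls back
 * to the CPU, matching Context::kCpuId used elsewhere in these tests):
 *
 * \code{.cc}
 *   Context cuda_ctx = MakeCUDACtx(0);   // CUDA device 0
 *   Context cpu_ctx = MakeCUDACtx(-1);   // stays on the CPU
 * \endcode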
*/ @@ -415,11 +398,12 @@ inline Context MakeCUDACtx(std::int32_t device) { } inline HostDeviceVector GenerateRandomGradients(const size_t n_rows, - float lower= 0.0f, float upper = 1.0f) { + float lower = 0.0f, + float upper = 1.0f) { xgboost::SimpleLCG gen; xgboost::SimpleRealUniformDistribution dist(lower, upper); std::vector h_gpair(n_rows); - for (auto &gpair : h_gpair) { + for (auto& gpair : h_gpair) { bst_float grad = dist(&gen); bst_float hess = dist(&gen); gpair = GradientPair(grad, hess); @@ -428,6 +412,16 @@ inline HostDeviceVector GenerateRandomGradients(const size_t n_row return gpair; } +inline linalg::Matrix GenerateRandomGradients(Context const* ctx, bst_row_t n_rows, + bst_target_t n_targets, + float lower = 0.0f, + float upper = 1.0f) { + auto g = GenerateRandomGradients(n_rows * n_targets, lower, upper); + linalg::Matrix gpair({n_rows, static_cast(n_targets)}, ctx->Device()); + gpair.Data()->Copy(g); + return gpair; +} + typedef void *DMatrixHandle; // NOLINT(*); class ArrayIterForTest { diff --git a/tests/cpp/linear/test_linear.cc b/tests/cpp/linear/test_linear.cc index 6b2d17e10..8f81428b3 100644 --- a/tests/cpp/linear/test_linear.cc +++ b/tests/cpp/linear/test_linear.cc @@ -24,8 +24,8 @@ TEST(Linear, Shotgun) { auto updater = std::unique_ptr(xgboost::LinearUpdater::Create("shotgun", &ctx)); updater->Configure({{"eta", "1."}}); - xgboost::HostDeviceVector gpair( - p_fmat->Info().num_row_, xgboost::GradientPair(-5, 1.0)); + linalg::Matrix gpair{ + linalg::Constant(&ctx, xgboost::GradientPair(-5, 1.0), p_fmat->Info().num_row_, 1)}; xgboost::gbm::GBLinearModel model{&mparam}; model.LazyInitModel(); updater->Update(&gpair, p_fmat.get(), &model, gpair.Size()); @@ -55,8 +55,8 @@ TEST(Linear, coordinate) { auto updater = std::unique_ptr( xgboost::LinearUpdater::Create("coord_descent", &ctx)); updater->Configure({{"eta", "1."}}); - xgboost::HostDeviceVector gpair( - p_fmat->Info().num_row_, xgboost::GradientPair(-5, 1.0)); + linalg::Matrix gpair{ + linalg::Constant(&ctx, xgboost::GradientPair(-5, 1.0), p_fmat->Info().num_row_, 1)}; xgboost::gbm::GBLinearModel model{&mparam}; model.LazyInitModel(); updater->Update(&gpair, p_fmat.get(), &model, gpair.Size()); diff --git a/tests/cpp/linear/test_linear.cu b/tests/cpp/linear/test_linear.cu index 6a2a6ef8c..8475116bc 100644 --- a/tests/cpp/linear/test_linear.cu +++ b/tests/cpp/linear/test_linear.cu @@ -1,4 +1,6 @@ -// Copyright by Contributors +/** + * Copyright 2018-2023, XGBoost Contributors + */ #include #include @@ -19,8 +21,7 @@ TEST(Linear, GPUCoordinate) { auto updater = std::unique_ptr( xgboost::LinearUpdater::Create("gpu_coord_descent", &ctx)); updater->Configure({{"eta", "1."}}); - xgboost::HostDeviceVector gpair( - mat->Info().num_row_, xgboost::GradientPair(-5, 1.0)); + auto gpair = linalg::Constant(&ctx, xgboost::GradientPair(-5, 1.0), mat->Info().num_row_, 1); xgboost::gbm::GBLinearModel model{&mparam}; model.LazyInitModel(); diff --git a/tests/cpp/objective/test_aft_obj.cc b/tests/cpp/objective/test_aft_obj.cc index 74973918c..972dfc53f 100644 --- a/tests/cpp/objective/test_aft_obj.cc +++ b/tests/cpp/objective/test_aft_obj.cc @@ -1,5 +1,5 @@ -/*! 
- * Copyright (c) by Contributors 2020 +/** + * Copyright 2020-2023, XGBoost Contributors */ #include #include @@ -12,9 +12,7 @@ #include "../helpers.h" #include "../../../src/common/survival_util.h" -namespace xgboost { -namespace common { - +namespace xgboost::common { TEST(Objective, DeclareUnifiedTest(AFTObjConfiguration)) { auto ctx = MakeCUDACtx(GPUIDX); std::unique_ptr objective(ObjFunction::Create("survival:aft", &ctx)); @@ -65,14 +63,14 @@ static inline void CheckGPairOverGridPoints( preds[i] = std::log(std::pow(2.0, i * (log_y_high - log_y_low) / (num_point - 1) + log_y_low)); } - HostDeviceVector out_gpair; + linalg::Matrix out_gpair; obj->GetGradient(HostDeviceVector(preds), info, 1, &out_gpair); - const auto& gpair = out_gpair.HostVector(); + const auto gpair = out_gpair.HostView(); CHECK_EQ(num_point, expected_grad.size()); CHECK_EQ(num_point, expected_hess.size()); for (int i = 0; i < num_point; ++i) { - EXPECT_NEAR(gpair[i].GetGrad(), expected_grad[i], ftol); - EXPECT_NEAR(gpair[i].GetHess(), expected_hess[i], ftol); + EXPECT_NEAR(gpair(i).GetGrad(), expected_grad[i], ftol); + EXPECT_NEAR(gpair(i).GetHess(), expected_hess[i], ftol); } } @@ -169,5 +167,4 @@ TEST(Objective, DeclareUnifiedTest(AFTObjGPairIntervalCensoredLabels)) { 0.2757f, 0.1776f, 0.1110f, 0.0682f, 0.0415f, 0.0251f, 0.0151f, 0.0091f, 0.0055f, 0.0033f }); } -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/tests/cpp/objective/test_lambdarank_obj.cc b/tests/cpp/objective/test_lambdarank_obj.cc index c808e97f0..963f69639 100644 --- a/tests/cpp/objective/test_lambdarank_obj.cc +++ b/tests/cpp/objective/test_lambdarank_obj.cc @@ -74,35 +74,35 @@ void TestNDCGGPair(Context const* ctx) { info.labels = linalg::Tensor{{0, 1, 0, 1}, {4, 1}, GPUIDX}; info.group_ptr_ = {0, 2, 4}; info.num_row_ = 4; - HostDeviceVector gpairs; + linalg::Matrix gpairs; obj->GetGradient(predts, info, 0, &gpairs); ASSERT_EQ(gpairs.Size(), predts.Size()); { predts = {1, 0, 1, 0}; - HostDeviceVector gpairs; + linalg::Matrix gpairs; obj->GetGradient(predts, info, 0, &gpairs); - for (size_t i = 0; i < gpairs.Size(); ++i) { - ASSERT_GT(gpairs.HostSpan()[i].GetHess(), 0); + for (std::size_t i = 0; i < gpairs.Size(); ++i) { + ASSERT_GT(gpairs.HostView()(i).GetHess(), 0); } - ASSERT_LT(gpairs.HostSpan()[1].GetGrad(), 0); - ASSERT_LT(gpairs.HostSpan()[3].GetGrad(), 0); + ASSERT_LT(gpairs.HostView()(1).GetGrad(), 0); + ASSERT_LT(gpairs.HostView()(3).GetGrad(), 0); - ASSERT_GT(gpairs.HostSpan()[0].GetGrad(), 0); - ASSERT_GT(gpairs.HostSpan()[2].GetGrad(), 0); + ASSERT_GT(gpairs.HostView()(0).GetGrad(), 0); + ASSERT_GT(gpairs.HostView()(2).GetGrad(), 0); info.weights_ = {2, 3}; - HostDeviceVector weighted_gpairs; + linalg::Matrix weighted_gpairs; obj->GetGradient(predts, info, 0, &weighted_gpairs); - auto const& h_gpairs = gpairs.ConstHostSpan(); - auto const& h_weighted_gpairs = weighted_gpairs.ConstHostSpan(); + auto const& h_gpairs = gpairs.HostView(); + auto const& h_weighted_gpairs = weighted_gpairs.HostView(); for (size_t i : {0ul, 1ul}) { - ASSERT_FLOAT_EQ(h_weighted_gpairs[i].GetGrad(), h_gpairs[i].GetGrad() * 2.0f); - ASSERT_FLOAT_EQ(h_weighted_gpairs[i].GetHess(), h_gpairs[i].GetHess() * 2.0f); + ASSERT_FLOAT_EQ(h_weighted_gpairs(i).GetGrad(), h_gpairs(i).GetGrad() * 2.0f); + ASSERT_FLOAT_EQ(h_weighted_gpairs(i).GetHess(), h_gpairs(i).GetHess() * 2.0f); } for (size_t i : {2ul, 3ul}) { - ASSERT_FLOAT_EQ(h_weighted_gpairs[i].GetGrad(), h_gpairs[i].GetGrad() * 3.0f); - 
ASSERT_FLOAT_EQ(h_weighted_gpairs[i].GetHess(), h_gpairs[i].GetHess() * 3.0f); + ASSERT_FLOAT_EQ(h_weighted_gpairs(i).GetGrad(), h_gpairs(i).GetGrad() * 3.0f); + ASSERT_FLOAT_EQ(h_weighted_gpairs(i).GetHess(), h_gpairs(i).GetHess() * 3.0f); } } @@ -125,7 +125,7 @@ void TestUnbiasedNDCG(Context const* ctx) { std::sort(h_label.begin(), h_label.end(), std::greater<>{}); HostDeviceVector predt(p_fmat->Info().num_row_, 1.0f); - HostDeviceVector out_gpair; + linalg::Matrix out_gpair; obj->GetGradient(predt, p_fmat->Info(), 0, &out_gpair); Json config{Object{}}; diff --git a/tests/cpp/objective/test_lambdarank_obj.cu b/tests/cpp/objective/test_lambdarank_obj.cu index 16dc45307..1c13665fc 100644 --- a/tests/cpp/objective/test_lambdarank_obj.cu +++ b/tests/cpp/objective/test_lambdarank_obj.cu @@ -42,20 +42,21 @@ void TestGPUMakePair() { auto d = dummy.View(ctx.gpu_id); linalg::Vector dgpair; auto dg = dgpair.View(ctx.gpu_id); - cuda_impl::KernelInputs args{d, - d, - d, - d, - p_cache->DataGroupPtr(&ctx), - p_cache->CUDAThreadsGroupPtr(), - rank_idx, - info.labels.View(ctx.gpu_id), - predt.ConstDeviceSpan(), - {}, - dg, - nullptr, - y_sorted_idx, - 0}; + cuda_impl::KernelInputs args{ + d, + d, + d, + d, + p_cache->DataGroupPtr(&ctx), + p_cache->CUDAThreadsGroupPtr(), + rank_idx, + info.labels.View(ctx.gpu_id), + predt.ConstDeviceSpan(), + linalg::MatrixView{common::Span{}, {0}, 0}, + dg, + nullptr, + y_sorted_idx, + 0}; return args; }; diff --git a/tests/cpp/objective/test_regression_obj.cc b/tests/cpp/objective/test_regression_obj.cc index b8a40603b..35e8287b6 100644 --- a/tests/cpp/objective/test_regression_obj.cc +++ b/tests/cpp/objective/test_regression_obj.cc @@ -122,8 +122,8 @@ TEST(Objective, DeclareUnifiedTest(LogisticRegressionBasic)) { EXPECT_NEAR(obj->ProbToMargin(0.1f), -2.197f, 0.01f); EXPECT_NEAR(obj->ProbToMargin(0.5f), 0, 0.01f); EXPECT_NEAR(obj->ProbToMargin(0.9f), 2.197f, 0.01f); - EXPECT_ANY_THROW(obj->ProbToMargin(10)) - << "Expected error when base_score not in range [0,1f] for LogisticRegression"; + EXPECT_ANY_THROW((void)obj->ProbToMargin(10)) + << "Expected error when base_score not in range [0,1f] for LogisticRegression"; // test PredTransform HostDeviceVector io_preds = {0, 0.1f, 0.5f, 0.9f, 1}; @@ -282,9 +282,9 @@ TEST(Objective, DeclareUnifiedTest(TweedieRegressionGPair)) { TEST(Objective, CPU_vs_CUDA) { Context ctx = MakeCUDACtx(GPUIDX); - ObjFunction* obj = ObjFunction::Create("reg:squarederror", &ctx); - HostDeviceVector cpu_out_preds; - HostDeviceVector cuda_out_preds; + std::unique_ptr obj{ObjFunction::Create("reg:squarederror", &ctx)}; + linalg::Matrix cpu_out_preds; + linalg::Matrix cuda_out_preds; constexpr size_t kRows = 400; constexpr size_t kCols = 100; @@ -300,7 +300,7 @@ TEST(Objective, CPU_vs_CUDA) { info.labels.Reshape(kRows); auto& h_labels = info.labels.Data()->HostVector(); for (size_t i = 0; i < h_labels.size(); ++i) { - h_labels[i] = 1 / (float)(i+1); + h_labels[i] = 1 / static_cast(i+1); } { @@ -314,19 +314,17 @@ TEST(Objective, CPU_vs_CUDA) { obj->GetGradient(preds, info, 0, &cuda_out_preds); } - auto& h_cpu_out = cpu_out_preds.HostVector(); - auto& h_cuda_out = cuda_out_preds.HostVector(); + auto h_cpu_out = cpu_out_preds.HostView(); + auto h_cuda_out = cuda_out_preds.HostView(); float sgrad = 0; float shess = 0; for (size_t i = 0; i < kRows; ++i) { - sgrad += std::pow(h_cpu_out[i].GetGrad() - h_cuda_out[i].GetGrad(), 2); - shess += std::pow(h_cpu_out[i].GetHess() - h_cuda_out[i].GetHess(), 2); + sgrad += std::pow(h_cpu_out(i).GetGrad() - 
h_cuda_out(i).GetGrad(), 2);
+    shess += std::pow(h_cpu_out(i).GetHess() - h_cuda_out(i).GetHess(), 2);
   }
   ASSERT_NEAR(sgrad, 0.0f, kRtEps);
   ASSERT_NEAR(shess, 0.0f, kRtEps);
-
-  delete obj;
 }
 #endif
diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc
index a54c42a98..5ff0fdeec 100644
--- a/tests/cpp/predictor/test_cpu_predictor.cc
+++ b/tests/cpp/predictor/test_cpu_predictor.cc
@@ -189,11 +189,10 @@ void TestUpdatePredictionCache(bool use_subsampling) {
   auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(true, true, kClasses);
-  HostDeviceVector<GradientPair> gpair;
-  auto& h_gpair = gpair.HostVector();
-  h_gpair.resize(kRows * kClasses);
+  linalg::Matrix<GradientPair> gpair({kRows, kClasses}, ctx.Device());
+  auto h_gpair = gpair.HostView();
   for (size_t i = 0; i < kRows * kClasses; ++i) {
-    h_gpair[i] = {static_cast<float>(i), 1};
+    std::apply(h_gpair, linalg::UnravelIndex(i, kRows, kClasses)) = {static_cast<float>(i), 1};
   }
   PredictionCacheEntry predtion_cache;
diff --git a/tests/cpp/test_multi_target.cc b/tests/cpp/test_multi_target.cc
index c8d371941..cc81a4ba2 100644
--- a/tests/cpp/test_multi_target.cc
+++ b/tests/cpp/test_multi_target.cc
@@ -68,10 +68,12 @@ class TestL1MultiTarget : public ::testing::Test {
     }
   }

-  void RunTest(std::string const& tree_method, bool weight) {
+  void RunTest(Context const* ctx, std::string const& tree_method, bool weight) {
     auto p_fmat = weight ? Xyw_ : Xy_;
     std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
-    learner->SetParams(Args{{"tree_method", tree_method}, {"objective", "reg:absoluteerror"}});
+    learner->SetParams(Args{{"tree_method", tree_method},
+                            {"objective", "reg:absoluteerror"},
+                            {"device", ctx->DeviceName()}});
     learner->Configure();
     for (auto i = 0; i < 4; ++i) {
       learner->UpdateOneIter(i, p_fmat);
@@ -87,7 +89,9 @@ class TestL1MultiTarget : public ::testing::Test {
     for (bst_target_t t{0}; t < p_fmat->Info().labels.Shape(1); ++t) {
       auto t_Xy = weight ?
single_w_[t] : single_[t]; std::unique_ptr sl{Learner::Create({t_Xy})}; - sl->SetParams(Args{{"tree_method", tree_method}, {"objective", "reg:absoluteerror"}}); + sl->SetParams(Args{{"tree_method", tree_method}, + {"objective", "reg:absoluteerror"}, + {"device", ctx->DeviceName()}}); sl->Configure(); sl->UpdateOneIter(0, t_Xy); Json s_config{Object{}}; @@ -104,20 +108,32 @@ class TestL1MultiTarget : public ::testing::Test { ASSERT_FLOAT_EQ(mean, base_score); } - void RunTest(std::string const& tree_method) { - this->RunTest(tree_method, false); - this->RunTest(tree_method, true); + void RunTest(Context const* ctx, std::string const& tree_method) { + this->RunTest(ctx, tree_method, false); + this->RunTest(ctx, tree_method, true); } }; -TEST_F(TestL1MultiTarget, Hist) { this->RunTest("hist"); } +TEST_F(TestL1MultiTarget, Hist) { + Context ctx; + this->RunTest(&ctx, "hist"); +} -TEST_F(TestL1MultiTarget, Exact) { this->RunTest("exact"); } +TEST_F(TestL1MultiTarget, Exact) { + Context ctx; + this->RunTest(&ctx, "exact"); +} -TEST_F(TestL1MultiTarget, Approx) { this->RunTest("approx"); } +TEST_F(TestL1MultiTarget, Approx) { + Context ctx; + this->RunTest(&ctx, "approx"); +} #if defined(XGBOOST_USE_CUDA) -TEST_F(TestL1MultiTarget, GpuHist) { this->RunTest("gpu_hist"); } +TEST_F(TestL1MultiTarget, GpuHist) { + auto ctx = MakeCUDACtx(0); + this->RunTest(&ctx, "hist"); +} #endif // defined(XGBOOST_USE_CUDA) TEST(MultiStrategy, Configure) { diff --git a/tests/cpp/tree/test_fit_stump.cc b/tests/cpp/tree/test_fit_stump.cc index 18511c3a0..d9441fd6f 100644 --- a/tests/cpp/tree/test_fit_stump.cc +++ b/tests/cpp/tree/test_fit_stump.cc @@ -1,5 +1,5 @@ /** - * Copyright 2022 by XGBoost Contributors + * Copyright 2022-2023, XGBoost Contributors */ #include #include @@ -8,17 +8,17 @@ #include "../../src/tree/fit_stump.h" #include "../helpers.h" -namespace xgboost { -namespace tree { +namespace xgboost::tree { namespace { void TestFitStump(Context const *ctx, DataSplitMode split = DataSplitMode::kRow) { std::size_t constexpr kRows = 16, kTargets = 2; - HostDeviceVector gpair; - auto &h_gpair = gpair.HostVector(); - h_gpair.resize(kRows * kTargets); + linalg::Matrix gpair; + gpair.SetDevice(ctx->Device()); + gpair.Reshape(kRows, kTargets); + auto h_gpair = gpair.HostView(); for (std::size_t i = 0; i < kRows; ++i) { for (std::size_t t = 0; t < kTargets; ++t) { - h_gpair.at(i * kTargets + t) = GradientPair{static_cast(i), 1}; + h_gpair(i, t) = GradientPair{static_cast(i), 1}; } } linalg::Vector out; @@ -53,6 +53,4 @@ TEST(InitEstimation, FitStumpColumnSplit) { auto constexpr kWorldSize{3}; RunWithInMemoryCommunicator(kWorldSize, &TestFitStump, &ctx, DataSplitMode::kCol); } - -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index dd2d802ca..50cdae741 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -214,7 +214,7 @@ TEST(GpuHist, TestHistogramIndex) { TestHistogramIndexImpl(); } -void UpdateTree(Context const* ctx, HostDeviceVector* gpair, DMatrix* dmat, +void UpdateTree(Context const* ctx, linalg::Matrix* gpair, DMatrix* dmat, size_t gpu_page_size, RegTree* tree, HostDeviceVector* preds, float subsample = 1.0f, const std::string& sampling_method = "uniform", int max_bin = 2) { @@ -264,7 +264,8 @@ TEST(GpuHist, UniformSampling) { // Create an in-memory DMatrix. 
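
In the GpuHist case above, the fixture now builds a CUDA context with MakeCUDACtx(0) and runs the plain "hist" method: the device is carried by the Context rather than by a dedicated gpu_hist updater name. The Python-level counterpart of that move is a sketch like the following, assuming a CUDA-enabled build (data and model choice are illustrative):

    import numpy as np
    import xgboost as xgb

    rng = np.random.default_rng(0)
    X = rng.normal(size=(256, 8))
    y = X[:, 0] - 2.0 * X[:, 1]

    # Formerly: tree_method="gpu_hist".  Now the algorithm and the device
    # are selected independently.
    reg = xgb.XGBRegressor(tree_method="hist", device="cuda",
                           objective="reg:absoluteerror")
    reg.fit(X, y)
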
std::unique_ptr dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true)); - auto gpair = GenerateRandomGradients(kRows); + linalg::Matrix gpair({kRows}, Context{}.MakeCUDA().Ordinal()); + gpair.Data()->Copy(GenerateRandomGradients(kRows)); // Build a tree using the in-memory DMatrix. RegTree tree; @@ -294,7 +295,8 @@ TEST(GpuHist, GradientBasedSampling) { // Create an in-memory DMatrix. std::unique_ptr dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true)); - auto gpair = GenerateRandomGradients(kRows); + linalg::Matrix gpair({kRows}, MakeCUDACtx(0).Ordinal()); + gpair.Data()->Copy(GenerateRandomGradients(kRows)); // Build a tree using the in-memory DMatrix. RegTree tree; @@ -330,11 +332,12 @@ TEST(GpuHist, ExternalMemory) { // Create a single batch DMatrix. std::unique_ptr dmat(CreateSparsePageDMatrix(kRows, kCols, 1, tmpdir.path + "/cache")); - auto gpair = GenerateRandomGradients(kRows); + Context ctx(MakeCUDACtx(0)); + linalg::Matrix gpair({kRows}, ctx.Ordinal()); + gpair.Data()->Copy(GenerateRandomGradients(kRows)); // Build a tree using the in-memory DMatrix. RegTree tree; - Context ctx(MakeCUDACtx(0)); HostDeviceVector preds(kRows, 0.0, 0); UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows); // Build another tree using multiple ELLPACK pages. @@ -367,12 +370,13 @@ TEST(GpuHist, ExternalMemoryWithSampling) { std::unique_ptr dmat_ext( CreateSparsePageDMatrix(kRows, kCols, kRows / kPageSize, tmpdir.path + "/cache")); - auto gpair = GenerateRandomGradients(kRows); + Context ctx(MakeCUDACtx(0)); + linalg::Matrix gpair({kRows}, ctx.Ordinal()); + gpair.Data()->Copy(GenerateRandomGradients(kRows)); // Build a tree using the in-memory DMatrix. auto rng = common::GlobalRandom(); - Context ctx(MakeCUDACtx(0)); RegTree tree; HostDeviceVector preds(kRows, 0.0, 0); UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, kSubsample, kSamplingMethod, kRows); diff --git a/tests/cpp/tree/test_histmaker.cc b/tests/cpp/tree/test_histmaker.cc index d03440339..e90120231 100644 --- a/tests/cpp/tree/test_histmaker.cc +++ b/tests/cpp/tree/test_histmaker.cc @@ -26,9 +26,11 @@ TEST(GrowHistMaker, InteractionConstraint) { auto constexpr kRows = 32; auto constexpr kCols = 16; auto p_dmat = GenerateDMatrix(kRows, kCols); - auto p_gradients = GenerateGradients(kRows); - Context ctx; + + linalg::Matrix gpair({kRows}, ctx.Ordinal()); + gpair.Data()->Copy(GenerateRandomGradients(kRows)); + ObjInfo task{ObjInfo::kRegression}; { // With constraints @@ -40,7 +42,7 @@ TEST(GrowHistMaker, InteractionConstraint) { Args{{"interaction_constraints", "[[0, 1]]"}, {"num_feature", std::to_string(kCols)}}); std::vector> position(1); updater->Configure(Args{}); - updater->Update(¶m, p_gradients.get(), p_dmat.get(), position, {&tree}); + updater->Update(¶m, &gpair, p_dmat.get(), position, {&tree}); ASSERT_EQ(tree.NumExtraNodes(), 4); ASSERT_EQ(tree[0].SplitIndex(), 1); @@ -57,7 +59,7 @@ TEST(GrowHistMaker, InteractionConstraint) { TrainParam param; param.Init(Args{}); updater->Configure(Args{}); - updater->Update(¶m, p_gradients.get(), p_dmat.get(), position, {&tree}); + updater->Update(¶m, &gpair, p_dmat.get(), position, {&tree}); ASSERT_EQ(tree.NumExtraNodes(), 10); ASSERT_EQ(tree[0].SplitIndex(), 1); @@ -70,9 +72,12 @@ TEST(GrowHistMaker, InteractionConstraint) { namespace { void VerifyColumnSplit(int32_t rows, bst_feature_t cols, bool categorical, RegTree const& expected_tree) { - auto p_dmat = GenerateDMatrix(rows, cols, categorical); - auto p_gradients = GenerateGradients(rows); Context ctx; + 
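
The GrowHistMaker test above encodes the constraint as the JSON string "[[0, 1]]": with it, every split along a path must stay inside one declared feature group, which is why the constrained tree only ever splits on features 1 and 0 while the unconstrained run grows more nodes. The same knob is exposed through the Python wrapper; a minimal sketch with synthetic data:

    import numpy as np
    import xgboost as xgb

    rng = np.random.default_rng(0)
    X = rng.normal(size=(128, 16))
    y = X[:, 0] * X[:, 1] + rng.normal(scale=0.1, size=128)

    reg = xgb.XGBRegressor(
        tree_method="hist",
        # Only features 0 and 1 may interact; no path mixes in other features.
        interaction_constraints=[[0, 1]],
    )
    reg.fit(X, y)
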
auto p_dmat = GenerateDMatrix(rows, cols, categorical); + linalg::Matrix gpair({rows}, ctx.Ordinal()); + gpair.Data()->Copy(GenerateRandomGradients(rows)); + + ObjInfo task{ObjInfo::kRegression}; std::unique_ptr updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)}; std::vector> position(1); @@ -84,7 +89,7 @@ void VerifyColumnSplit(int32_t rows, bst_feature_t cols, bool categorical, TrainParam param; param.Init(Args{}); updater->Configure(Args{}); - updater->Update(¶m, p_gradients.get(), sliced.get(), position, {&tree}); + updater->Update(¶m, &gpair, sliced.get(), position, {&tree}); Json json{Object{}}; tree.SaveModel(&json); @@ -100,15 +105,16 @@ void TestColumnSplit(bool categorical) { RegTree expected_tree{1u, kCols}; ObjInfo task{ObjInfo::kRegression}; { - auto p_dmat = GenerateDMatrix(kRows, kCols, categorical); - auto p_gradients = GenerateGradients(kRows); Context ctx; + auto p_dmat = GenerateDMatrix(kRows, kCols, categorical); + linalg::Matrix gpair({kRows}, ctx.Ordinal()); + gpair.Data()->Copy(GenerateRandomGradients(kRows)); std::unique_ptr updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)}; std::vector> position(1); TrainParam param; param.Init(Args{}); updater->Configure(Args{}); - updater->Update(¶m, p_gradients.get(), p_dmat.get(), position, {&expected_tree}); + updater->Update(¶m, &gpair, p_dmat.get(), position, {&expected_tree}); } auto constexpr kWorldSize = 2; diff --git a/tests/cpp/tree/test_prediction_cache.cc b/tests/cpp/tree/test_prediction_cache.cc index 0aafb0a4f..fc1d05087 100644 --- a/tests/cpp/tree/test_prediction_cache.cc +++ b/tests/cpp/tree/test_prediction_cache.cc @@ -69,7 +69,7 @@ class TestPredictionCache : public ::testing::Test { std::unique_ptr updater{TreeUpdater::Create(updater_name, ctx, &task)}; RegTree tree; std::vector trees{&tree}; - auto gpair = GenerateRandomGradients(n_samples_); + auto gpair = GenerateRandomGradients(ctx, n_samples_, 1); tree::TrainParam param; param.UpdateAllowUnknown(Args{{"max_bin", "64"}}); diff --git a/tests/cpp/tree/test_prune.cc b/tests/cpp/tree/test_prune.cc index 843e2b2ee..1a3ec532e 100644 --- a/tests/cpp/tree/test_prune.cc +++ b/tests/cpp/tree/test_prune.cc @@ -21,15 +21,13 @@ TEST(Updater, Prune) { std::vector> cfg; cfg.emplace_back("num_feature", std::to_string(kCols)); cfg.emplace_back("min_split_loss", "10"); + Context ctx; // These data are just place holders. 
- HostDeviceVector gpair = - { {0.50f, 0.25f}, {0.50f, 0.25f}, {0.50f, 0.25f}, {0.50f, 0.25f}, - {0.25f, 0.24f}, {0.25f, 0.24f}, {0.25f, 0.24f}, {0.25f, 0.24f} }; - std::shared_ptr p_dmat { - RandomDataGenerator{32, 10, 0}.GenerateDMatrix() }; - - Context ctx; + linalg::Matrix gpair + {{ {0.50f, 0.25f}, {0.50f, 0.25f}, {0.50f, 0.25f}, {0.50f, 0.25f}, + {0.25f, 0.24f}, {0.25f, 0.24f}, {0.25f, 0.24f}, {0.25f, 0.24f} }, {8, 1}, ctx.Device()}; + std::shared_ptr p_dmat{RandomDataGenerator{32, 10, 0}.GenerateDMatrix()}; // prepare tree RegTree tree = RegTree{1u, kCols}; diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc index 4afea74ce..6327703ed 100644 --- a/tests/cpp/tree/test_quantile_hist.cc +++ b/tests/cpp/tree/test_quantile_hist.cc @@ -202,13 +202,13 @@ TEST(QuantileHist, PartitionerColSplit) { TestColumnSplitPartitioner(3); } namespace { -void VerifyColumnSplit(bst_row_t rows, bst_feature_t cols, bst_target_t n_targets, +void VerifyColumnSplit(Context const* ctx, bst_row_t rows, bst_feature_t cols, bst_target_t n_targets, RegTree const& expected_tree) { auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true); - auto p_gradients = GenerateGradients(rows, n_targets); - Context ctx; + linalg::Matrix gpair = GenerateRandomGradients(ctx, rows, n_targets); + ObjInfo task{ObjInfo::kRegression}; - std::unique_ptr updater{TreeUpdater::Create("grow_quantile_histmaker", &ctx, &task)}; + std::unique_ptr updater{TreeUpdater::Create("grow_quantile_histmaker", ctx, &task)}; std::vector> position(1); std::unique_ptr sliced{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())}; @@ -217,7 +217,7 @@ void VerifyColumnSplit(bst_row_t rows, bst_feature_t cols, bst_target_t n_target TrainParam param; param.Init(Args{}); updater->Configure(Args{}); - updater->Update(¶m, p_gradients.get(), sliced.get(), position, {&tree}); + updater->Update(¶m, &gpair, sliced.get(), position, {&tree}); Json json{Object{}}; tree.SaveModel(&json); @@ -232,21 +232,21 @@ void TestColumnSplit(bst_target_t n_targets) { RegTree expected_tree{n_targets, kCols}; ObjInfo task{ObjInfo::kRegression}; + Context ctx; { auto Xy = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true); - auto p_gradients = GenerateGradients(kRows, n_targets); - Context ctx; + auto gpair = GenerateRandomGradients(&ctx, kRows, n_targets); std::unique_ptr updater{ TreeUpdater::Create("grow_quantile_histmaker", &ctx, &task)}; std::vector> position(1); TrainParam param; param.Init(Args{}); updater->Configure(Args{}); - updater->Update(¶m, p_gradients.get(), Xy.get(), position, {&expected_tree}); + updater->Update(¶m, &gpair, Xy.get(), position, {&expected_tree}); } auto constexpr kWorldSize = 2; - RunWithInMemoryCommunicator(kWorldSize, VerifyColumnSplit, kRows, kCols, n_targets, + RunWithInMemoryCommunicator(kWorldSize, VerifyColumnSplit, &ctx, kRows, kCols, n_targets, std::cref(expected_tree)); } } // anonymous namespace diff --git a/tests/cpp/tree/test_refresh.cc b/tests/cpp/tree/test_refresh.cc index 11ce94f59..c8859c898 100644 --- a/tests/cpp/tree/test_refresh.cc +++ b/tests/cpp/tree/test_refresh.cc @@ -17,10 +17,11 @@ namespace xgboost::tree { TEST(Updater, Refresh) { bst_row_t constexpr kRows = 8; bst_feature_t constexpr kCols = 16; + Context ctx; - HostDeviceVector gpair = - { {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, - {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f} }; + linalg::Matrix gpair + {{ {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, 
+ {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f} }, {8, 1}, ctx.Device()}; std::shared_ptr p_dmat{ RandomDataGenerator{kRows, kCols, 0.4f}.Seed(3).GenerateDMatrix()}; std::vector> cfg{ @@ -29,7 +30,6 @@ TEST(Updater, Refresh) { {"reg_lambda", "1"}}; RegTree tree = RegTree{1u, kCols}; - Context ctx; std::vector trees{&tree}; ObjInfo task{ObjInfo::kRegression}; diff --git a/tests/cpp/tree/test_tree_stat.cc b/tests/cpp/tree/test_tree_stat.cc index d125c84d5..dc9a9c209 100644 --- a/tests/cpp/tree/test_tree_stat.cc +++ b/tests/cpp/tree/test_tree_stat.cc @@ -16,7 +16,7 @@ namespace xgboost { class UpdaterTreeStatTest : public ::testing::Test { protected: std::shared_ptr p_dmat_; - HostDeviceVector gpairs_; + linalg::Matrix gpairs_; size_t constexpr static kRows = 10; size_t constexpr static kCols = 10; @@ -24,8 +24,8 @@ class UpdaterTreeStatTest : public ::testing::Test { void SetUp() override { p_dmat_ = RandomDataGenerator(kRows, kCols, .5f).GenerateDMatrix(true); auto g = GenerateRandomGradients(kRows); - gpairs_.Resize(kRows); - gpairs_.Copy(g); + gpairs_.Reshape(kRows, 1); + gpairs_.Data()->Copy(g); } void RunTest(std::string updater) { @@ -63,7 +63,7 @@ TEST_F(UpdaterTreeStatTest, Approx) { this->RunTest("grow_histmaker"); } class UpdaterEtaTest : public ::testing::Test { protected: std::shared_ptr p_dmat_; - HostDeviceVector gpairs_; + linalg::Matrix gpairs_; size_t constexpr static kRows = 10; size_t constexpr static kCols = 10; size_t constexpr static kClasses = 10; @@ -71,8 +71,8 @@ class UpdaterEtaTest : public ::testing::Test { void SetUp() override { p_dmat_ = RandomDataGenerator(kRows, kCols, .5f).GenerateDMatrix(true, false, kClasses); auto g = GenerateRandomGradients(kRows); - gpairs_.Resize(kRows); - gpairs_.Copy(g); + gpairs_.Reshape(kRows, 1); + gpairs_.Data()->Copy(g); } void RunTest(std::string updater) { @@ -125,14 +125,15 @@ TEST_F(UpdaterEtaTest, GpuHist) { this->RunTest("grow_gpu_hist"); } class TestMinSplitLoss : public ::testing::Test { std::shared_ptr dmat_; - HostDeviceVector gpair_; + linalg::Matrix gpair_; void SetUp() override { constexpr size_t kRows = 32; constexpr size_t kCols = 16; constexpr float kSparsity = 0.6; dmat_ = RandomDataGenerator(kRows, kCols, kSparsity).Seed(3).GenerateDMatrix(); - gpair_ = GenerateRandomGradients(kRows); + gpair_.Reshape(kRows, 1); + gpair_.Data()->Copy(GenerateRandomGradients(kRows)); } std::int32_t Update(Context const* ctx, std::string updater, float gamma) { diff --git a/tests/python-gpu/test_gpu_with_sklearn.py b/tests/python-gpu/test_gpu_with_sklearn.py index 530d3e9df..9f902ce32 100644 --- a/tests/python-gpu/test_gpu_with_sklearn.py +++ b/tests/python-gpu/test_gpu_with_sklearn.py @@ -1,3 +1,4 @@ +import itertools import json import os import sys @@ -158,6 +159,96 @@ def test_classififer(): clf.fit(X, y) +@pytest.mark.parametrize( + "use_cupy,tree_method,device,order,gdtype,strategy", + [ + c + for c in itertools.product( + (True, False), + ("hist", "approx"), + ("cpu", "cuda"), + ("C", "F"), + ("float64", "float32"), + ("one_output_per_tree", "multi_output_tree"), + ) + ], +) +def test_custom_objective( + use_cupy: bool, + tree_method: str, + device: str, + order: str, + gdtype: str, + strategy: str, +) -> None: + from sklearn.datasets import load_iris + + X, y = load_iris(return_X_y=True) + + params = { + "tree_method": tree_method, + "device": device, + "n_estimators": 8, + "multi_strategy": strategy, + } + + obj = tm.softprob_obj(y.max() + 1, use_cupy=use_cupy, order=order, gdtype=gdtype) + + clf = 
xgb.XGBClassifier(objective=obj, **params) + + if strategy == "multi_output_tree" and tree_method == "approx": + with pytest.raises(ValueError, match=r"Only the hist"): + clf.fit(X, y) + return + if strategy == "multi_output_tree" and device == "cuda": + with pytest.raises(ValueError, match=r"GPU is not yet"): + clf.fit(X, y) + return + + clf.fit(X, y) + + clf_1 = xgb.XGBClassifier(**params) + clf_1.fit(X, y) + + np.testing.assert_allclose(clf.predict_proba(X), clf_1.predict_proba(X), rtol=1e-4) + + params["n_estimators"] = 2 + + def wrong_shape(labels, predt): + grad, hess = obj(labels, predt) + return grad[:, :-1], hess[:, :-1] + + with pytest.raises(ValueError, match="should be equal to the number of"): + clf = xgb.XGBClassifier(objective=wrong_shape, **params) + clf.fit(X, y) + + def wrong_shape_1(labels, predt): + grad, hess = obj(labels, predt) + return grad[:-1, :], hess[:-1, :] + + with pytest.raises(ValueError, match="Mismatched size between the gradient"): + clf = xgb.XGBClassifier(objective=wrong_shape_1, **params) + clf.fit(X, y) + + def wrong_shape_2(labels, predt): + grad, hess = obj(labels, predt) + return grad[:, :], hess[:-1, :] + + with pytest.raises(ValueError, match="Mismatched shape between the gradient"): + clf = xgb.XGBClassifier(objective=wrong_shape_2, **params) + clf.fit(X, y) + + def wrong_shape_3(labels, predt): + grad, hess = obj(labels, predt) + grad = grad.reshape(grad.size) + hess = hess.reshape(hess.size) + return grad, hess + + with pytest.warns(FutureWarning, match="required to be"): + clf = xgb.XGBClassifier(objective=wrong_shape_3, **params) + clf.fit(X, y) + + @pytest.mark.skipif(**tm.no_pandas()) def test_ranking_qid_df(): import cudf From aa86bd5207964882a6a11eb401387747c8b1f18c Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 25 Aug 2023 20:23:47 +0800 Subject: [PATCH 120/136] [dask] Filter models on worker. (#9518) --- python-package/xgboost/dask.py | 72 +++++++++++++++++----------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/python-package/xgboost/dask.py b/python-package/xgboost/dask.py index 219ad2698..f62a3e5af 100644 --- a/python-package/xgboost/dask.py +++ b/python-package/xgboost/dask.py @@ -47,6 +47,7 @@ from typing import ( Callable, Dict, Generator, + Iterable, List, Optional, Sequence, @@ -97,10 +98,12 @@ if TYPE_CHECKING: import dask import distributed from dask import array as da + from dask import bag as db from dask import dataframe as dd else: dd = LazyLoader("dd", globals(), "dask.dataframe") da = LazyLoader("da", globals(), "dask.array") + db = LazyLoader("db", globals(), "dask.bag") dask = LazyLoader("dask", globals(), "dask") distributed = LazyLoader("distributed", globals(), "dask.distributed") @@ -509,12 +512,10 @@ async def map_worker_partitions( func: Callable[..., _MapRetT], *refs: Any, workers: Sequence[str], -) -> List[_MapRetT]: +) -> _MapRetT: """Map a function onto partitions of each worker.""" # Note for function purity: - # XGBoost is deterministic in most of the cases, which means train function is - # supposed to be idempotent. One known exception is gblinear with shotgun updater. - # We haven't been able to do a full verification so here we keep pure to be False. + # XGBoost is sensitive to data partition and uses random number generator. 
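
The body that follows wraps each worker's return value in a one-element list, builds a dask bag over those futures, and folds the bag with a first-valid reduction, so the single non-None result (the booster from a worker that actually held data) is what comes back. The pattern in isolation, as a runnable sketch with placeholder values standing in for per-worker results:

    import dask.bag as db

    def first_valid(results):
        # Keep the first non-None entry; workers without data contribute None.
        for v in results:
            if v is not None:
                return v
        return None

    parts = [None, "booster-from-rank-1", None]  # placeholder worker outputs
    bag = db.from_sequence(parts, npartitions=len(parts))
    print(bag.reduction(first_valid, first_valid).compute())  # booster-from-rank-1
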
client = _xgb_get_client(client) futures = [] for addr in workers: @@ -526,11 +527,26 @@ async def map_worker_partitions( else: args.append(ref) fut = client.submit( - func, *args, pure=False, workers=[addr], allow_other_workers=False + # turn result into a list for bag construction + lambda *args, **kwargs: [func(*args, **kwargs)], + *args, + pure=False, + workers=[addr], + allow_other_workers=False, ) futures.append(fut) - results = await client.gather(futures) - return results + + def first_valid(results: Iterable[Optional[_MapRetT]]) -> Optional[_MapRetT]: + for v in results: + if v is not None: + return v + return None + + bag = db.from_delayed(futures) + fut = await bag.reduction(first_valid, first_valid) + result = await client.compute(fut).result() + + return result _DataParts = List[Dict[str, Any]] @@ -882,29 +898,6 @@ def _get_workers_from_data( return list(X_worker_map) -def _filter_empty( - booster: Booster, local_history: TrainingCallback.EvalsLog, is_valid: bool -) -> Optional[TrainReturnT]: - n_workers = collective.get_world_size() - non_empty = numpy.zeros(shape=(n_workers,), dtype=numpy.int32) - rank = collective.get_rank() - non_empty[rank] = int(is_valid) - non_empty = collective.allreduce(non_empty, collective.Op.SUM) - non_empty = non_empty.astype(bool) - ret: Optional[TrainReturnT] = { - "booster": booster, - "history": local_history, - } - for i in range(non_empty.size): - # This is the first valid worker - if non_empty[i] and i == rank: - return ret - if non_empty[i]: - return None - - raise ValueError("None of the workers can provide a valid result.") - - async def _check_workers_are_alive( workers: List[str], client: "distributed.Client" ) -> None: @@ -997,10 +990,17 @@ async def _train_async( xgb_model=xgb_model, callbacks=callbacks, ) - # Don't return the boosters from empty workers. It's quite difficult to - # guarantee everything is in sync in the present of empty workers, - # especially with complex objectives like quantile. - return _filter_empty(booster, local_history, Xy.num_row() != 0) + # Don't return the boosters from empty workers. It's quite difficult to + # guarantee everything is in sync in the present of empty workers, especially + # with complex objectives like quantile. + if Xy.num_row() != 0: + ret: Optional[TrainReturnT] = { + "booster": booster, + "history": local_history, + } + else: + ret = None + return ret async with distributed.MultiLock(workers, client): if evals is not None: @@ -1012,7 +1012,7 @@ async def _train_async( evals_name = [] evals_id = [] - results = await map_worker_partitions( + result = await map_worker_partitions( client, dispatched_train, # extra function parameters @@ -1025,7 +1025,7 @@ async def _train_async( # workers to be used for training workers=workers, ) - return list(filter(lambda ret: ret is not None, results))[0] + return result @_deprecate_positional_args From 209335b18cea53814aa78daee02079f1f39ee72a Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sun, 27 Aug 2023 03:37:05 +0800 Subject: [PATCH 121/136] Remove the deprecated Python rabit module. 
(#9523) --- python-package/xgboost/__init__.py | 2 +- python-package/xgboost/rabit.py | 169 ----------------------------- tests/python/test_collective.py | 31 ------ 3 files changed, 1 insertion(+), 201 deletions(-) delete mode 100644 python-package/xgboost/rabit.py diff --git a/python-package/xgboost/__init__.py b/python-package/xgboost/__init__.py index f17ac23ba..220093b47 100644 --- a/python-package/xgboost/__init__.py +++ b/python-package/xgboost/__init__.py @@ -4,7 +4,7 @@ Contributors: https://github.com/dmlc/xgboost/blob/master/CONTRIBUTORS.md """ from . import tracker # noqa -from . import collective, dask, rabit +from . import collective, dask from .core import ( Booster, DataIter, diff --git a/python-package/xgboost/rabit.py b/python-package/xgboost/rabit.py deleted file mode 100644 index 132d72178..000000000 --- a/python-package/xgboost/rabit.py +++ /dev/null @@ -1,169 +0,0 @@ -"""Compatibility shim for xgboost.rabit; to be removed in 2.0""" -import logging -import warnings -from enum import IntEnum, unique -from typing import Any, Callable, List, Optional, TypeVar - -import numpy as np - -from . import collective - -LOGGER = logging.getLogger("[xgboost.rabit]") - - -def _deprecation_warning() -> str: - return ( - "The xgboost.rabit submodule is marked as deprecated in 1.7 and will be removed " - "in 2.0. Please use xgboost.collective instead." - ) - - -def init(args: Optional[List[bytes]] = None) -> None: - """Initialize the rabit library with arguments""" - warnings.warn(_deprecation_warning(), FutureWarning) - parsed = {} - if args: - for arg in args: - kv = arg.decode().split("=") - if len(kv) == 2: - parsed[kv[0]] = kv[1] - collective.init(**parsed) - - -def finalize() -> None: - """Finalize the process, notify tracker everything is done.""" - collective.finalize() - - -def get_rank() -> int: - """Get rank of current process. - Returns - ------- - rank : int - Rank of current process. - """ - return collective.get_rank() - - -def get_world_size() -> int: - """Get total number workers. - Returns - ------- - n : int - Total number of process. - """ - return collective.get_world_size() - - -def is_distributed() -> int: - """If rabit is distributed.""" - return collective.is_distributed() - - -def tracker_print(msg: Any) -> None: - """Print message to the tracker. - This function can be used to communicate the information of - the progress to the tracker - Parameters - ---------- - msg : str - The message to be printed to tracker. - """ - collective.communicator_print(msg) - - -def get_processor_name() -> bytes: - """Get the processor name. - Returns - ------- - name : str - the name of processor(host) - """ - return collective.get_processor_name().encode() - - -T = TypeVar("T") # pylint:disable=invalid-name - - -def broadcast(data: T, root: int) -> T: - """Broadcast object from one node to all other nodes. - Parameters - ---------- - data : any type that can be pickled - Input data, if current rank does not equal root, this can be None - root : int - Rank of the node to broadcast data from. - Returns - ------- - object : int - the result of broadcast. - """ - return collective.broadcast(data, root) - - -@unique -class Op(IntEnum): - """Supported operations for rabit.""" - - MAX = 0 - MIN = 1 - SUM = 2 - OR = 3 - - -def allreduce( # pylint:disable=invalid-name - data: np.ndarray, op: Op, prepare_fun: Optional[Callable[[np.ndarray], None]] = None -) -> np.ndarray: - """Perform allreduce, return the result. - Parameters - ---------- - data : - Input data. 
- op : - Reduction operators, can be MIN, MAX, SUM, BITOR - prepare_fun : - Lazy preprocessing function, if it is not None, prepare_fun(data) - will be called by the function before performing allreduce, to initialize the data - If the result of Allreduce can be recovered directly, - then prepare_fun will NOT be called - Returns - ------- - result : - The result of allreduce, have same shape as data - Notes - ----- - This function is not thread-safe. - """ - if prepare_fun is None: - return collective.allreduce(data, collective.Op(op)) - raise ValueError("preprocessing function is no longer supported") - - -def version_number() -> int: - """Returns version number of current stored model. - This means how many calls to CheckPoint we made so far. - Returns - ------- - version : int - Version number of currently stored model - """ - return 0 - - -class RabitContext: - """A context controlling rabit initialization and finalization.""" - - def __init__(self, args: Optional[List[bytes]] = None) -> None: - if args is None: - args = [] - self.args = args - - def __enter__(self) -> None: - init(self.args) - assert is_distributed() - LOGGER.warning(_deprecation_warning()) - LOGGER.debug("-------------- rabit say hello ------------------") - - def __exit__(self, *args: List) -> None: - finalize() - LOGGER.debug("--------------- rabit say bye ------------------") diff --git a/tests/python/test_collective.py b/tests/python/test_collective.py index 32b0a67a7..f7de0400d 100644 --- a/tests/python/test_collective.py +++ b/tests/python/test_collective.py @@ -39,37 +39,6 @@ def test_rabit_communicator(): assert worker.exitcode == 0 -# TODO(rongou): remove this once we remove the rabit api. -def run_rabit_api_worker(rabit_env, world_size): - with xgb.rabit.RabitContext(rabit_env): - assert xgb.rabit.get_world_size() == world_size - assert xgb.rabit.is_distributed() - assert xgb.rabit.get_processor_name().decode() == socket.gethostname() - ret = xgb.rabit.broadcast('test1234', 0) - assert str(ret) == 'test1234' - ret = xgb.rabit.allreduce(np.asarray([1, 2, 3]), xgb.rabit.Op.SUM) - assert np.array_equal(ret, np.asarray([2, 4, 6])) - - -# TODO(rongou): remove this once we remove the rabit api. -def test_rabit_api(): - world_size = 2 - tracker = RabitTracker(host_ip='127.0.0.1', n_workers=world_size) - tracker.start(world_size) - rabit_env = [] - for k, v in tracker.worker_envs().items(): - rabit_env.append(f"{k}={v}".encode()) - workers = [] - for _ in range(world_size): - worker = multiprocessing.Process(target=run_rabit_api_worker, - args=(rabit_env, world_size)) - workers.append(worker) - worker.start() - for worker in workers: - worker.join() - assert worker.exitcode == 0 - - def run_federated_worker(port, world_size, rank): with xgb.collective.CommunicatorContext(xgboost_communicator='federated', federated_server_address=f'localhost:{port}', From 1b87a1d8f8c1b372f1101badc7b3fdf03499ea28 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sun, 27 Aug 2023 05:10:59 +0800 Subject: [PATCH 122/136] [rabit] Small cleanup to tracker initialization. (#9524) - Remove recover related code. - Clean startup, no need to consider previously connected nodes. 
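
For anyone still on the removed shim, the deleted code above doubles as a migration guide: rabit.init(args) parsed key=value pairs into collective.init(**kwargs), rabit.allreduce(x, rabit.Op.SUM) maps to collective.allreduce(x, collective.Op.SUM), tracker_print to communicator_print, and RabitContext to CommunicatorContext. A minimal sketch of the replacement API, assuming a single-process run where the communicator degenerates to rank 0 of world size 1:

    import numpy as np
    import xgboost as xgb

    xgb.collective.init()  # no tracker arguments: single-worker mode
    try:
        print(xgb.collective.get_rank(), xgb.collective.get_world_size())  # 0 1
        out = xgb.collective.allreduce(np.asarray([1, 2, 3]),
                                       xgb.collective.Op.SUM)
        print(out)  # unchanged with a single worker
    finally:
        xgb.collective.finalize()
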
--- python-package/xgboost/tracker.py | 10 ++-------- rabit/src/allreduce_base.cc | 13 +------------ 2 files changed, 3 insertions(+), 20 deletions(-) diff --git a/python-package/xgboost/tracker.py b/python-package/xgboost/tracker.py index 142a70fc5..606c63791 100644 --- a/python-package/xgboost/tracker.py +++ b/python-package/xgboost/tracker.py @@ -137,15 +137,9 @@ class WorkerEntry: return self._get_remote(wait_conn, nnset) def _get_remote( - self, wait_conn: Dict[int, "WorkerEntry"], nnset: Set[int] + self, wait_conn: Dict[int, "WorkerEntry"], badset: Set[int] ) -> List[int]: while True: - ngood = self.sock.recvint() - goodset = set() - for _ in range(ngood): - goodset.add(self.sock.recvint()) - assert goodset.issubset(nnset) - badset = nnset - goodset conset = [] for r in badset: if r in wait_conn: @@ -343,7 +337,7 @@ class RabitTracker: shutdown[s.rank] = s logging.debug("Received %s signal from %d", s.cmd, s.rank) continue - assert s.cmd in ("start", "recover") + assert s.cmd == "start" # lazily initialize the workers if tree_map is None: assert s.cmd == "start" diff --git a/rabit/src/allreduce_base.cc b/rabit/src/allreduce_base.cc index e123b52d8..ac08ac12a 100644 --- a/rabit/src/allreduce_base.cc +++ b/rabit/src/allreduce_base.cc @@ -318,21 +318,10 @@ bool AllreduceBase::ReConnectLinks(const char *cmd) { // get number of to connect and number of to accept nodes from tracker int num_conn, num_accept, num_error = 1; do { - // send over good links - std::vector good_link; for (auto & all_link : all_links) { - if (!all_link.sock.BadSocket()) { - good_link.push_back(static_cast(all_link.rank)); - } else { - if (!all_link.sock.IsClosed()) all_link.sock.Close(); - } + all_link.sock.Close(); } - int ngood = static_cast(good_link.size()); // tracker construct goodset - Assert(tracker.SendAll(&ngood, sizeof(ngood)) == sizeof(ngood), "ReConnectLink failure 5"); - for (int &i : good_link) { - Assert(tracker.SendAll(&i, sizeof(i)) == sizeof(i), "ReConnectLink failure 6"); - } Assert(tracker.RecvAll(&num_conn, sizeof(num_conn)) == sizeof(num_conn), "ReConnectLink failure 7"); Assert(tracker.RecvAll(&num_accept, sizeof(num_accept)) == sizeof(num_accept), From c3574d932f5555c090ac9e6c4ee6b82baf3ace7a Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 28 Aug 2023 18:36:11 +0800 Subject: [PATCH 123/136] [R] Fix integer inputs with NA. (#9522) --- R-package/src/xgboost_R.cc | 24 +++++++++++++---- R-package/tests/testthat/test_dmatrix.R | 36 +++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 5 deletions(-) diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index b975ab8ba..b7eae03c4 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -113,11 +113,25 @@ XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing, SEXP n_threads) { ctx.nthread = asInteger(n_threads); std::int32_t threads = ctx.Threads(); - xgboost::common::ParallelFor(nrow, threads, [&](xgboost::omp_ulong i) { - for (size_t j = 0; j < ncol; ++j) { - data[i * ncol + j] = is_int ? 
static_cast(iin[i + nrow * j]) : din[i + nrow * j]; - } - }); + if (is_int) { + xgboost::common::ParallelFor(nrow, threads, [&](xgboost::omp_ulong i) { + for (size_t j = 0; j < ncol; ++j) { + auto v = iin[i + nrow * j]; + if (v == NA_INTEGER) { + data[i * ncol + j] = std::numeric_limits::quiet_NaN(); + } else { + data[i * ncol + j] = static_cast(v); + } + } + }); + } else { + xgboost::common::ParallelFor(nrow, threads, [&](xgboost::omp_ulong i) { + for (size_t j = 0; j < ncol; ++j) { + data[i * ncol + j] = din[i + nrow * j]; + } + }); + } + DMatrixHandle handle; CHECK_CALL(XGDMatrixCreateFromMat_omp(BeginPtr(data), nrow, ncol, asReal(missing), &handle, threads)); diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R index 57cc82c17..8d74a0357 100644 --- a/R-package/tests/testthat/test_dmatrix.R +++ b/R-package/tests/testthat/test_dmatrix.R @@ -56,6 +56,42 @@ test_that("xgb.DMatrix: basic construction", { expect_equal(raw_fd, raw_dgc) }) +test_that("xgb.DMatrix: NA", { + n_samples <- 3 + x <- cbind( + x1 = sample(x = 4, size = n_samples, replace = TRUE), + x2 = sample(x = 4, size = n_samples, replace = TRUE) + ) + x[1, "x1"] <- NA + + m <- xgb.DMatrix(x) + xgb.DMatrix.save(m, "int.dmatrix") + + x <- matrix(as.numeric(x), nrow = n_samples, ncol = 2) + colnames(x) <- c("x1", "x2") + m <- xgb.DMatrix(x) + + xgb.DMatrix.save(m, "float.dmatrix") + + iconn <- file("int.dmatrix", "rb") + fconn <- file("float.dmatrix", "rb") + + expect_equal(file.size("int.dmatrix"), file.size("float.dmatrix")) + + bytes <- file.size("int.dmatrix") + idmatrix <- readBin(iconn, "raw", n = bytes) + fdmatrix <- readBin(fconn, "raw", n = bytes) + + expect_equal(length(idmatrix), length(fdmatrix)) + expect_equal(idmatrix, fdmatrix) + + close(iconn) + close(fconn) + + file.remove("int.dmatrix") + file.remove("float.dmatrix") +}) + test_that("xgb.DMatrix: saving, loading", { # save to a local file dtest1 <- xgb.DMatrix(test_data, label = test_label) From 90ef250ea10618865193bd0ca5c6491fc589d741 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 28 Aug 2023 21:01:22 +0800 Subject: [PATCH 124/136] [rabit] Drop support for MPI backend. (#9525) - Add checks in cmake. - Remove mpi related code. --- CMakeLists.txt | 30 +++++--- cmake/Utils.cmake | 4 - rabit/CMakeLists.txt | 4 +- rabit/src/engine_mpi.cc | 162 ---------------------------------------- 4 files changed, 21 insertions(+), 179 deletions(-) delete mode 100644 rabit/src/engine_mpi.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 27da42376..d9d2d7cc8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -47,7 +47,6 @@ option(USE_OPENMP "Build with OpenMP support." 
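
The NA fix above hinges on how R represents a missing integer: NA_INTEGER is the INT_MIN sentinel, so the old straight cast to float smuggled a huge negative value into the DMatrix instead of a missing one. The new branch rewrites the sentinel to NaN element by element; the same transformation in numpy terms (illustrative sketch; the sentinel constant is R's, not an XGBoost API):

    import numpy as np

    NA_INTEGER = -(2**31)  # R's sentinel for integer NA (INT_MIN)

    x = np.array([[NA_INTEGER, 2], [3, 4]], dtype=np.int32)
    xf = x.astype(np.float64)
    xf[x == NA_INTEGER] = np.nan  # what XGDMatrixCreateFromMat_R now does per cell
    print(xf)  # [[nan  2.] [ 3.  4.]]
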
ON) option(BUILD_STATIC_LIB "Build static library" OFF) option(BUILD_DEPRECATED_CLI "Build the deprecated command line interface" OFF) option(FORCE_SHARED_CRT "Build with dynamic CRT on Windows (/MD)" OFF) -option(RABIT_BUILD_MPI "Build MPI" OFF) ## Bindings option(JVM_BINDINGS "Build JVM bindings" OFF) option(R_LIB "Build shared library for R package" OFF) @@ -106,12 +105,6 @@ if (R_LIB AND GOOGLE_TEST) message(WARNING "Some C++ unittests will fail with `R_LIB` enabled, as R package redirects some functions to R runtime implementation.") endif (R_LIB AND GOOGLE_TEST) -if (USE_AVX) - message(SEND_ERROR "The option 'USE_AVX' is deprecated as experimental AVX features have been removed from XGBoost.") -endif (USE_AVX) -if (PLUGIN_LZ4) - message(SEND_ERROR "The option 'PLUGIN_LZ4' is removed from XGBoost.") -endif (PLUGIN_LZ4) if (PLUGIN_RMM AND NOT (USE_CUDA)) message(SEND_ERROR "`PLUGIN_RMM` must be enabled with `USE_CUDA` flag.") endif (PLUGIN_RMM AND NOT (USE_CUDA)) @@ -144,6 +137,26 @@ if (PLUGIN_FEDERATED) endif () endif () +#-- Removed options +if (USE_AVX) + message(SEND_ERROR "The option `USE_AVX` is deprecated as experimental AVX features have been removed from XGBoost.") +endif (USE_AVX) +if (PLUGIN_LZ4) + message(SEND_ERROR "The option `PLUGIN_LZ4` is removed from XGBoost.") +endif (PLUGIN_LZ4) +if (RABIT_BUILD_MPI) + message(SEND_ERROR "The option `RABIT_BUILD_MPI` has been removed from XGBoost.") +endif (RABIT_BUILD_MPI) +if (USE_S3) + message(SEND_ERROR "The option `USE_S3` has been removed from XGBoost") +endif (USE_S3) +if (USE_AZURE) + message(SEND_ERROR "The option `USE_AZURE` has been removed from XGBoost") +endif (USE_AZURE) +if (USE_HDFS) + message(SEND_ERROR "The option `USE_HDFS` has been removed from XGBoost") +endif (USE_HDFS) + #-- Sanitizer if (USE_SANITIZER) include(cmake/Sanitizer.cmake) @@ -222,9 +235,6 @@ endif (MSVC) # rabit add_subdirectory(rabit) -if (RABIT_BUILD_MPI) - find_package(MPI REQUIRED) -endif (RABIT_BUILD_MPI) # core xgboost add_subdirectory(${xgboost_SOURCE_DIR}/src) diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index 08050205c..f215c3eca 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -295,10 +295,6 @@ macro(xgboost_target_link_libraries target) target_link_libraries(${target} PRIVATE CUDA::nvToolsExt) endif (USE_NVTX) - if (RABIT_BUILD_MPI) - target_link_libraries(${target} PRIVATE MPI::MPI_CXX) - endif (RABIT_BUILD_MPI) - if (MINGW) target_link_libraries(${target} PRIVATE wsock32 ws2_32) endif (MINGW) diff --git a/rabit/CMakeLists.txt b/rabit/CMakeLists.txt index ab8171b2b..977d4867e 100644 --- a/rabit/CMakeLists.txt +++ b/rabit/CMakeLists.txt @@ -6,9 +6,7 @@ set(RABIT_SOURCES ${CMAKE_CURRENT_LIST_DIR}/src/allreduce_base.cc ${CMAKE_CURRENT_LIST_DIR}/src/rabit_c_api.cc) -if (RABIT_BUILD_MPI) - list(APPEND RABIT_SOURCES ${CMAKE_CURRENT_LIST_DIR}/src/engine_mpi.cc) -elseif (RABIT_MOCK) +if (RABIT_MOCK) list(APPEND RABIT_SOURCES ${CMAKE_CURRENT_LIST_DIR}/src/engine_mock.cc) else () list(APPEND RABIT_SOURCES ${CMAKE_CURRENT_LIST_DIR}/src/engine.cc) diff --git a/rabit/src/engine_mpi.cc b/rabit/src/engine_mpi.cc deleted file mode 100644 index d5ed2f454..000000000 --- a/rabit/src/engine_mpi.cc +++ /dev/null @@ -1,162 +0,0 @@ -/*! 
- * Copyright (c) 2014 by Contributors - * \file engine_mpi.cc - * \brief this file gives an implementation of engine interface using MPI, - * this will allow rabit program to run with MPI, but do not comes with fault tolerant - * - * \author Tianqi Chen - */ -#define NOMINMAX -#include -#include -#include -#include -#include "rabit/internal/engine.h" -#include "rabit/internal/utils.h" - -namespace rabit { -namespace engine { -/*! \brief implementation of engine using MPI */ -class MPIEngine : public IEngine { - public: - MPIEngine(void) { - version_number = 0; - } - void Allgather(void *sendrecvbuf_, size_t total_size, size_t slice_begin, - size_t slice_end, size_t size_prev_slice) override { - utils::Error("MPIEngine:: Allgather is not supported"); - } - void Allreduce(void *sendrecvbuf_, size_t type_nbytes, size_t count, - ReduceFunction reducer, PreprocFunction prepare_fun, - void *prepare_arg) override { - utils::Error("MPIEngine:: Allreduce is not supported,"\ - "use Allreduce_ instead"); - } - int GetRingPrevRank(void) const override { - utils::Error("MPIEngine:: GetRingPrevRank is not supported"); - return -1; - } - void Broadcast(void *sendrecvbuf_, size_t size, int root) override { - MPI::COMM_WORLD.Bcast(sendrecvbuf_, size, MPI::CHAR, root); - } - virtual void InitAfterException(void) { - utils::Error("MPI is not fault tolerant"); - } - virtual int LoadCheckPoint(Serializable *global_model, - Serializable *local_model = NULL) { - return 0; - } - virtual void CheckPoint(const Serializable *global_model, - const Serializable *local_model = NULL) { - version_number += 1; - } - virtual void LazyCheckPoint(const Serializable *global_model) { - version_number += 1; - } - virtual int VersionNumber(void) const { - return version_number; - } - /*! \brief get rank of current node */ - virtual int GetRank(void) const { - return MPI::COMM_WORLD.Get_rank(); - } - /*! \brief get total number of */ - virtual int GetWorldSize(void) const { - return MPI::COMM_WORLD.Get_size(); - } - /*! \brief whether it is distributed */ - virtual bool IsDistributed(void) const { - return true; - } - /*! \brief get the host name of current node */ - virtual std::string GetHost(void) const { - int len; - char name[MPI_MAX_PROCESSOR_NAME]; - MPI::Get_processor_name(name, len); - name[len] = '\0'; - return std::string(name); - } - virtual void TrackerPrint(const std::string &msg) { - // simply print information into the tracker - if (GetRank() == 0) { - utils::Printf("%s", msg.c_str()); - } - } - - private: - int version_number; -}; - -// singleton sync manager -MPIEngine manager; - -/*! \brief initialize the synchronization module */ -bool Init(int argc, char *argv[]) { - try { - MPI::Init(argc, argv); - return true; - } catch (const std::exception& e) { - fprintf(stderr, " failed in MPI Init %s\n", e.what()); - return false; - } -} -/*! \brief finalize syncrhonization module */ -bool Finalize(void) { - try { - MPI::Finalize(); - return true; - } catch (const std::exception& e) { - fprintf(stderr, "failed in MPI shutdown %s\n", e.what()); - return false; - } -} - -/*! 
\brief singleton method to get engine */ -IEngine *GetEngine(void) { - return &manager; -} -// transform enum to MPI data type -inline MPI::Datatype GetType(mpi::DataType dtype) { - using namespace mpi; - switch (dtype) { - case kChar: return MPI::CHAR; - case kUChar: return MPI::BYTE; - case kInt: return MPI::INT; - case kUInt: return MPI::UNSIGNED; - case kLong: return MPI::LONG; - case kULong: return MPI::UNSIGNED_LONG; - case kFloat: return MPI::FLOAT; - case kDouble: return MPI::DOUBLE; - case kLongLong: return MPI::LONG_LONG; - case kULongLong: return MPI::UNSIGNED_LONG_LONG; - } - utils::Error("unknown mpi::DataType"); - return MPI::CHAR; -} -// transform enum to MPI OP -inline MPI::Op GetOp(mpi::OpType otype) { - using namespace mpi; - switch (otype) { - case kMax: return MPI::MAX; - case kMin: return MPI::MIN; - case kSum: return MPI::SUM; - case kBitwiseOR: return MPI::BOR; - } - utils::Error("unknown mpi::OpType"); - return MPI::MAX; -} -// perform in-place allreduce, on sendrecvbuf -void Allreduce_(void *sendrecvbuf, - size_t type_nbytes, - size_t count, - IEngine::ReduceFunction red, - mpi::DataType dtype, - mpi::OpType op, - IEngine::PreprocFunction prepare_fun, - void *prepare_arg) { - if (prepare_fun != NULL) prepare_fun(prepare_arg); - MPI::COMM_WORLD.Allreduce(MPI_IN_PLACE, sendrecvbuf, - count, GetType(dtype), GetOp(op)); -} -} // namespace engine -} // namespace rabit From be6a55295695cb00c6d5b009dd466c3aa351b224 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 29 Aug 2023 08:27:13 +0800 Subject: [PATCH 125/136] [R] Support multi-class custom objective. (#9526) --- R-package/R/utils.R | 28 +++++-- R-package/src/xgboost_R.cc | 8 +- .../tests/testthat/test_custom_objective.R | 73 +++++++++++++++++-- include/xgboost/linalg.h | 7 ++ .../xgboost4j/src/native/xgboost4j.cpp | 4 +- src/c_api/c_api_utils.h | 12 +-- 6 files changed, 106 insertions(+), 26 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 5faca2ef4..2fc2321ca 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -151,14 +151,30 @@ xgb.iter.update <- function(booster_handle, dtrain, iter, obj) { if (is.null(obj)) { .Call(XGBoosterUpdateOneIter_R, booster_handle, as.integer(iter), dtrain) } else { - pred <- predict(booster_handle, dtrain, outputmargin = TRUE, training = TRUE, - ntreelimit = 0) + pred <- predict( + booster_handle, + dtrain, + outputmargin = TRUE, + training = TRUE, + reshape = TRUE + ) gpair <- obj(pred, dtrain) n_samples <- dim(dtrain)[1] - # We still require row-major in R as I'm not quite sure sure how to get the stride of - # the matrix in C. 
- gpair$grad <- matrix(gpair$grad, nrow = n_samples, byrow = TRUE) - gpair$hess <- matrix(gpair$hess, nrow = n_samples, byrow = TRUE) + + msg <- paste( + "Since 2.1.0, the shape of the gradient and hessian is required to be ", + "(n_samples, n_targets) or (n_samples, n_classes).", + sep = "" + ) + if (is.matrix(gpair$grad) && dim(gpair$grad)[1] != n_samples) { + warning(msg) + } + if (is.numeric(gpair$grad) && length(gpair$grad) != n_samples) { + warning(msg) + } + + gpair$grad <- matrix(gpair$grad, nrow = n_samples) + gpair$hess <- matrix(gpair$hess, nrow = n_samples) .Call( XGBoosterBoostOneIter_R, booster_handle, dtrain, iter, gpair$grad, gpair$hess ) diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index b7eae03c4..44082f255 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -403,7 +403,7 @@ XGB_DLL SEXP XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) { XGB_DLL SEXP XGBoosterTrainOneIter_R(SEXP handle, SEXP dtrain, SEXP iter, SEXP grad, SEXP hess) { R_API_BEGIN(); - CHECK_EQ(length(grad), length(hess)) << "gradient and hess must have same length"; + CHECK_EQ(length(grad), length(hess)) << "gradient and hess must have same length."; SEXP gdim = getAttrib(grad, R_DimSymbol); auto n_samples = static_cast(INTEGER(gdim)[0]); auto n_targets = static_cast(INTEGER(gdim)[1]); @@ -415,8 +415,8 @@ XGB_DLL SEXP XGBoosterTrainOneIter_R(SEXP handle, SEXP dtrain, SEXP iter, SEXP g double const *d_hess = REAL(hess); auto ctx = xgboost::detail::BoosterCtx(R_ExternalPtrAddr(handle)); - auto [s_grad, s_hess] = - xgboost::detail::MakeGradientInterface(ctx, d_grad, d_hess, n_samples, n_targets); + auto [s_grad, s_hess] = xgboost::detail::MakeGradientInterface( + ctx, d_grad, d_hess, xgboost::linalg::kF, n_samples, n_targets); CHECK_CALL(XGBoosterTrainOneIter(R_ExternalPtrAddr(handle), R_ExternalPtrAddr(dtrain), asInteger(iter), s_grad.c_str(), s_hess.c_str())); @@ -435,7 +435,7 @@ XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evn std::vector vec_sptr; for (int i = 0; i < len; ++i) { vec_dmats.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i))); - vec_names.push_back(std::string(CHAR(asChar(VECTOR_ELT(evnames, i))))); + vec_names.emplace_back(CHAR(asChar(VECTOR_ELT(evnames, i)))); } for (int i = 0; i < len; ++i) { vec_sptr.push_back(vec_names[i].c_str()); diff --git a/R-package/tests/testthat/test_custom_objective.R b/R-package/tests/testthat/test_custom_objective.R index 95be4140f..42f43cede 100644 --- a/R-package/tests/testthat/test_custom_objective.R +++ b/R-package/tests/testthat/test_custom_objective.R @@ -64,23 +64,80 @@ test_that("custom objective using DMatrix attr works", { expect_equal(class(bst), "xgb.Booster") }) -test_that("custom objective with multi-class works", { +test_that("custom objective with multi-class shape", { data <- as.matrix(iris[, -5]) label <- as.numeric(iris$Species) - 1 dtrain <- xgb.DMatrix(data = data, label = label) - nclasses <- 3 + n_classes <- 3 fake_softprob <- function(preds, dtrain) { expect_true(all(matrix(preds) == 0.5)) - grad <- rnorm(dim(as.matrix(preds))[1]) - expect_equal(dim(data)[1] * nclasses, dim(as.matrix(preds))[1]) - hess <- rnorm(dim(as.matrix(preds))[1]) - return (list(grad = grad, hess = hess)) + ## use numeric vector here to test compatibility with XGBoost < 2.1 + grad <- rnorm(length(as.matrix(preds))) + expect_equal(dim(data)[1] * n_classes, dim(as.matrix(preds))[1] * n_classes) + hess <- rnorm(length(as.matrix(preds))) + return(list(grad = grad, hess = hess)) 
} fake_merror <- function(preds, dtrain) { - expect_equal(dim(data)[1] * nclasses, dim(as.matrix(preds))[1]) + expect_equal(dim(data)[1] * n_classes, dim(as.matrix(preds))[1]) } param$objective <- fake_softprob param$eval_metric <- fake_merror - bst <- xgb.train(param, dtrain, 1, num_class = nclasses) + bst <- xgb.train(param, dtrain, 1, num_class = n_classes) +}) + +softmax <- function(values) { + values <- as.numeric(values) + exps <- exp(values) + den <- sum(exps) + return(exps / den) +} + +softprob <- function(predt, dtrain) { + y <- getinfo(dtrain, "label") + + n_samples <- dim(predt)[1] + n_classes <- dim(predt)[2] + + grad <- matrix(nrow = n_samples, ncol = n_classes) + hess <- matrix(nrow = n_samples, ncol = n_classes) + + for (i in seq_len(n_samples)) { + t <- y[i] + p <- softmax(predt[i, ]) + for (c in seq_len(n_classes)) { + g <- if (c - 1 == t) { + p[c] - 1.0 + } else { + p[c] + } + h <- max((2.0 * p[c] * (1.0 - p[c])), 1e-6) + grad[i, c] <- g + hess[i, c] <- h + } + } + + return(list(grad = grad, hess = hess)) +} + + +test_that("custom objective with multi-class works", { + data <- as.matrix(iris[, -5]) + label <- as.numeric(iris$Species) - 1 + + dtrain <- xgb.DMatrix(data = data, label = label) + + param$num_class <- 3 + param$objective <- softprob + param$eval_metric <- "merror" + param$base_score <- 0.5 + + custom_bst <- xgb.train(param, dtrain, 2) + custom_predt <- predict(custom_bst, dtrain) + + param$objective <- "multi:softmax" + builtin_bst <- xgb.train(param, dtrain, 2) + builtin_predt <- predict(builtin_bst, dtrain) + + expect_equal(custom_predt, builtin_predt) }) diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index ae3489e3b..d95651ca7 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -602,6 +602,13 @@ auto MakeTensorView(Context const *ctx, common::Span data, S &&...shape) { return MakeTensorView(ctx->gpu_id, data, std::forward(shape)...); } +template +auto MakeTensorView(Context const *ctx, Order order, common::Span data, S &&...shape) { + std::size_t in_shape[sizeof...(S)]; + detail::IndexToArr(in_shape, std::forward(shape)...); + return TensorView{data, in_shape, ctx->Ordinal(), order}; +} + template auto MakeTensorView(Context const *ctx, HostDeviceVector *data, S &&...shape) { auto span = ctx->IsCPU() ? data->HostSpan() : data->DeviceSpan(); diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp index 60c2f126c..c0c077430 100644 --- a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp +++ b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp @@ -607,8 +607,8 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterTrainOneI } auto ctx = xgboost::detail::BoosterCtx(handle); - auto [s_grad, s_hess] = - xgboost::detail::MakeGradientInterface(ctx, grad, hess, n_samples, n_targets); + auto [s_grad, s_hess] = xgboost::detail::MakeGradientInterface( + ctx, grad, hess, xgboost::linalg::kC, n_samples, n_targets); int ret = XGBoosterTrainOneIter(handle, dtrain, static_cast(jiter), s_grad.c_str(), s_hess.c_str()); diff --git a/src/c_api/c_api_utils.h b/src/c_api/c_api_utils.h index e42eed633..19dd6d639 100644 --- a/src/c_api/c_api_utils.h +++ b/src/c_api/c_api_utils.h @@ -354,12 +354,12 @@ void MakeSparseFromPtr(PtrT const *p_indptr, I const *p_indices, T const *p_data * @brief Make array interface for other language bindings. 
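
The order flag threaded through MakeGradientInterface in this patch exists because the two bindings hand over the same bytes with different layouts: R matrices are column-major, so xgboost_R.cc passes linalg::kF, while the JVM fills row-major arrays and passes linalg::kC. A quick numpy illustration of why the flag cannot be inferred from the buffer alone:

    import numpy as np

    vals = np.arange(6, dtype=np.float32)       # one shared byte buffer
    row_major = vals.reshape(3, 2, order="C")   # JVM view (kC): sample 0 -> [0, 1]
    col_major = vals.reshape(3, 2, order="F")   # R view (kF):   sample 0 -> [0, 3]
    print(row_major[0], col_major[0])
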
*/ template -auto MakeGradientInterface(Context const *ctx, G const *grad, H const *hess, std::size_t n_samples, - std::size_t n_targets) { - auto t_grad = - linalg::MakeTensorView(ctx, common::Span{grad, n_samples * n_targets}, n_samples, n_targets); - auto t_hess = - linalg::MakeTensorView(ctx, common::Span{hess, n_samples * n_targets}, n_samples, n_targets); +auto MakeGradientInterface(Context const *ctx, G const *grad, H const *hess, linalg::Order order, + std::size_t n_samples, std::size_t n_targets) { + auto t_grad = linalg::MakeTensorView(ctx, order, common::Span{grad, n_samples * n_targets}, + n_samples, n_targets); + auto t_hess = linalg::MakeTensorView(ctx, order, common::Span{hess, n_samples * n_targets}, + n_samples, n_targets); auto s_grad = linalg::ArrayInterfaceStr(t_grad); auto s_hess = linalg::ArrayInterfaceStr(t_hess); return std::make_tuple(s_grad, s_hess); From 942b957eef8e81c071ba543847eea8f4d2806df6 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 29 Aug 2023 10:06:03 +0800 Subject: [PATCH 126/136] Fix GPU categorical split memory allocation. (#9529) --- src/common/categorical.h | 2 +- src/tree/gpu_hist/evaluate_splits.cu | 21 +++++------ src/tree/gpu_hist/evaluate_splits.cuh | 22 ++++++++---- src/tree/updater_gpu_common.cuh | 36 ++++++++----------- src/tree/updater_gpu_hist.cu | 50 +++++++++++++++------------ 5 files changed, 67 insertions(+), 64 deletions(-) diff --git a/src/common/categorical.h b/src/common/categorical.h index 249a818e5..32b771ad6 100644 --- a/src/common/categorical.h +++ b/src/common/categorical.h @@ -52,7 +52,7 @@ inline XGBOOST_DEVICE bool InvalidCat(float cat) { * * Go to left if it's NOT the matching category, which matches one-hot encoding. */ -inline XGBOOST_DEVICE bool Decision(common::Span cats, float cat) { +inline XGBOOST_DEVICE bool Decision(common::Span cats, float cat) { KCatBitField const s_cats(cats); if (XGBOOST_EXPECT(InvalidCat(cat), false)) { return true; diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 30941c060..ecfc6c3ce 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -1,5 +1,5 @@ -/*! 
- * Copyright 2020-2022 by XGBoost Contributors +/** + * Copyright 2020-2023, XGBoost Contributors */ #include // std::max #include @@ -11,9 +11,7 @@ #include "evaluate_splits.cuh" #include "expand_entry.cuh" -namespace xgboost { -namespace tree { - +namespace xgboost::tree { // With constraints XGBOOST_DEVICE float LossChangeMissing(const GradientPairInt64 &scan, const GradientPairInt64 &missing, @@ -315,11 +313,11 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu common::Span out, DeviceSplitCandidate *p_out_split) { auto &out_split = *p_out_split; - out_split.split_cats = common::CatBitField{out}; + auto out_cats = common::CatBitField{out}; // Simple case for one hot split if (common::UseOneHot(shared_inputs.FeatureBins(fidx), shared_inputs.param.max_cat_to_onehot)) { - out_split.split_cats.Set(common::AsCat(out_split.thresh)); + out_cats.Set(common::AsCat(out_split.thresh)); return; } @@ -339,7 +337,7 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu assert(partition > 0 && "Invalid partition."); thrust::for_each(thrust::seq, beg, beg + partition, [&](size_t c) { auto cat = shared_inputs.feature_values[c - node_offset]; - out_split.SetCat(cat); + out_cats.Set(common::AsCat(cat)); }); } @@ -444,8 +442,7 @@ void GPUHistEvaluator::EvaluateSplits( if (split.is_cat) { SetCategoricalSplit(shared_inputs, d_sorted_idx, fidx, i, - device_cats_accessor.GetNodeCatStorage(input.nidx), - &out_splits[i]); + device_cats_accessor.GetNodeCatStorage(input.nidx), &out_splits[i]); } float base_weight = @@ -477,6 +474,4 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit( cudaMemcpyDeviceToHost)); return root_entry; } - -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 25a8cde89..667982aa9 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -37,8 +37,8 @@ struct EvaluateSplitSharedInputs { common::Span feature_values; common::Span min_fvalue; bool is_dense; - XGBOOST_DEVICE auto Features() const { return feature_segments.size() - 1; } - __device__ auto FeatureBins(bst_feature_t fidx) const { + [[nodiscard]] XGBOOST_DEVICE auto Features() const { return feature_segments.size() - 1; } + [[nodiscard]] __device__ std::uint32_t FeatureBins(bst_feature_t fidx) const { return feature_segments[fidx + 1] - feature_segments[fidx]; } }; @@ -105,7 +105,7 @@ class GPUHistEvaluator { } /** - * \brief Get device category storage of nidx for internal calculation. + * @brief Get device category storage of nidx for internal calculation. */ auto DeviceCatStorage(const std::vector &nidx) { if (!has_categoricals_) return CatAccessor{}; @@ -120,8 +120,8 @@ class GPUHistEvaluator { /** * \brief Get sorted index storage based on the left node of inputs. */ - auto SortedIdx(int num_nodes, bst_feature_t total_bins) { - if(!need_sort_histogram_) return common::Span(); + auto SortedIdx(int num_nodes, bst_bin_t total_bins) { + if (!need_sort_histogram_) return common::Span{}; cat_sorted_idx_.resize(num_nodes * total_bins); return dh::ToSpan(cat_sorted_idx_); } @@ -146,12 +146,22 @@ class GPUHistEvaluator { * \brief Get host category storage for nidx. Different from the internal version, this * returns strictly 1 node. 
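A standalone sketch (hypothetical names) of the storage layout that `GetNodeCatStorage()` and the two accessors above rely on: one flat buffer of bitfield words shared by every node, with node `nidx` owning a fixed-size segment, so no per-node allocation is needed.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    struct FlatNodeCats {
      std::vector<std::uint32_t> words;  // bitfield words for all nodes
      std::size_t words_per_node;        // plays the role of node_categorical_storage_size_

      std::uint32_t *Segment(std::size_t nidx) {
        return words.data() + nidx * words_per_node;  // [nidx * n, (nidx + 1) * n)
      }
    };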
*/ - common::Span GetHostNodeCats(bst_node_t nidx) const { + [[nodiscard]] common::Span GetHostNodeCats(bst_node_t nidx) const { copy_stream_.View().Sync(); auto cats_out = common::Span{h_split_cats_}.subspan( nidx * node_categorical_storage_size_, node_categorical_storage_size_); return cats_out; } + + [[nodiscard]] auto GetDeviceNodeCats(bst_node_t nidx) { + copy_stream_.View().Sync(); + if (has_categoricals_) { + CatAccessor accessor = {dh::ToSpan(split_cats_), node_categorical_storage_size_}; + return common::KCatBitField{accessor.GetNodeCatStorage(nidx)}; + } else { + return common::KCatBitField{}; + } + } /** * \brief Add a split to the internal tree evaluator. */ diff --git a/src/tree/updater_gpu_common.cuh b/src/tree/updater_gpu_common.cuh index 1637300b6..8f5b27ac6 100644 --- a/src/tree/updater_gpu_common.cuh +++ b/src/tree/updater_gpu_common.cuh @@ -64,7 +64,6 @@ struct DeviceSplitCandidate { // split. bst_cat_t thresh{-1}; - common::CatBitField split_cats; bool is_cat { false }; GradientPairInt64 left_sum; @@ -72,12 +71,6 @@ struct DeviceSplitCandidate { XGBOOST_DEVICE DeviceSplitCandidate() {} // NOLINT - template - XGBOOST_DEVICE void SetCat(T c) { - this->split_cats.Set(common::AsCat(c)); - fvalue = std::max(this->fvalue, static_cast(c)); - } - XGBOOST_DEVICE void Update(float loss_chg_in, DefaultDirection dir_in, float fvalue_in, int findex_in, GradientPairInt64 left_sum_in, GradientPairInt64 right_sum_in, bool cat, @@ -100,22 +93,23 @@ struct DeviceSplitCandidate { */ XGBOOST_DEVICE void UpdateCat(float loss_chg_in, DefaultDirection dir_in, bst_cat_t thresh_in, bst_feature_t findex_in, GradientPairInt64 left_sum_in, - GradientPairInt64 right_sum_in, GPUTrainingParam const& param, const GradientQuantiser& quantiser) { - if (loss_chg_in > loss_chg && - quantiser.ToFloatingPoint(left_sum_in).GetHess() >= param.min_child_weight && - quantiser.ToFloatingPoint(right_sum_in).GetHess() >= param.min_child_weight) { - loss_chg = loss_chg_in; - dir = dir_in; - fvalue = std::numeric_limits::quiet_NaN(); - thresh = thresh_in; - is_cat = true; - left_sum = left_sum_in; - right_sum = right_sum_in; - findex = findex_in; - } + GradientPairInt64 right_sum_in, GPUTrainingParam const& param, + const GradientQuantiser& quantiser) { + if (loss_chg_in > loss_chg && + quantiser.ToFloatingPoint(left_sum_in).GetHess() >= param.min_child_weight && + quantiser.ToFloatingPoint(right_sum_in).GetHess() >= param.min_child_weight) { + loss_chg = loss_chg_in; + dir = dir_in; + fvalue = std::numeric_limits::quiet_NaN(); + thresh = thresh_in; + is_cat = true; + left_sum = left_sum_in; + right_sum = right_sum_in; + findex = findex_in; + } } - XGBOOST_DEVICE bool IsValid() const { return loss_chg > 0.0f; } + [[nodiscard]] XGBOOST_DEVICE bool IsValid() const { return loss_chg > 0.0f; } friend std::ostream& operator<<(std::ostream& os, DeviceSplitCandidate const& c) { os << "loss_chg:" << c.loss_chg << ", " diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 33dfbf8c5..10fb913b3 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -7,9 +7,9 @@ #include #include -#include -#include -#include +#include // for size_t +#include // for unique_ptr, make_unique +#include // for move #include #include "../collective/communicator-inl.cuh" @@ -216,9 +216,9 @@ struct GPUHistMakerDevice { void InitFeatureGroupsOnce() { if (!feature_groups) { CHECK(page); - feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense, - dh::MaxSharedMemoryOptin(ctx_->gpu_id), - 
sizeof(GradientPairPrecise))); + feature_groups = std::make_unique(page->Cuts(), page->is_dense, + dh::MaxSharedMemoryOptin(ctx_->gpu_id), + sizeof(GradientPairPrecise)); } } @@ -245,10 +245,10 @@ struct GPUHistMakerDevice { this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, dmat->Info().IsColumnSplit(), ctx_->gpu_id); - quantiser.reset(new GradientQuantiser(this->gpair)); + quantiser = std::make_unique(this->gpair); row_partitioner.reset(); // Release the device memory first before reallocating - row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, sample.sample_rows)); + row_partitioner = std::make_unique(ctx_->gpu_id, sample.sample_rows); // Init histogram hist.Init(ctx_->gpu_id, page->Cuts().TotalBins()); @@ -295,7 +295,7 @@ struct GPUHistMakerDevice { dh::TemporaryArray entries(2 * candidates.size()); // Store the feature set ptrs so they dont go out of scope before the kernel is called std::vector>> feature_sets; - for (size_t i = 0; i < candidates.size(); i++) { + for (std::size_t i = 0; i < candidates.size(); i++) { auto candidate = candidates.at(i); int left_nidx = tree[candidate.nid].LeftChild(); int right_nidx = tree[candidate.nid].RightChild(); @@ -328,14 +328,13 @@ struct GPUHistMakerDevice { d_node_inputs.data().get(), h_node_inputs.data(), h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault)); - this->evaluator_.EvaluateSplits(nidx, max_active_features, - dh::ToSpan(d_node_inputs), shared_inputs, - dh::ToSpan(entries)); + this->evaluator_.EvaluateSplits(nidx, max_active_features, dh::ToSpan(d_node_inputs), + shared_inputs, dh::ToSpan(entries)); dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(), entries.data().get(), sizeof(GPUExpandEntry) * entries.size(), cudaMemcpyDeviceToHost)); dh::DefaultStream().Sync(); - } + } void BuildHist(int nidx) { auto d_node_hist = hist.GetNodeHistogram(nidx); @@ -367,23 +366,29 @@ struct GPUHistMakerDevice { struct NodeSplitData { RegTree::Node split_node; FeatureType split_type; - common::CatBitField node_cats; + common::KCatBitField node_cats; }; - void UpdatePosition(const std::vector& candidates, RegTree* p_tree) { - if (candidates.empty()) return; - std::vector nidx(candidates.size()); - std::vector left_nidx(candidates.size()); - std::vector right_nidx(candidates.size()); + void UpdatePosition(std::vector const& candidates, RegTree* p_tree) { + if (candidates.empty()) { + return; + } + + std::vector nidx(candidates.size()); + std::vector left_nidx(candidates.size()); + std::vector right_nidx(candidates.size()); std::vector split_data(candidates.size()); + for (size_t i = 0; i < candidates.size(); i++) { - auto& e = candidates[i]; + auto const& e = candidates[i]; RegTree::Node split_node = (*p_tree)[e.nid]; auto split_type = p_tree->NodeSplitType(e.nid); nidx.at(i) = e.nid; left_nidx.at(i) = split_node.LeftChild(); right_nidx.at(i) = split_node.RightChild(); - split_data.at(i) = NodeSplitData{split_node, split_type, e.split.split_cats}; + split_data.at(i) = NodeSplitData{split_node, split_type, evaluator_.GetDeviceNodeCats(e.nid)}; + + CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat); } auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); @@ -391,7 +396,7 @@ struct GPUHistMakerDevice { nidx, left_nidx, right_nidx, split_data, [=] __device__(bst_uint ridx, const NodeSplitData& data) { // given a row index, returns the node id it belongs to - bst_float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex()); + float cut_value = d_matrix.GetFvalue(ridx, 
data.split_node.SplitIndex()); // Missing value bool go_left = true; if (isnan(cut_value)) { @@ -621,7 +626,6 @@ struct GPUHistMakerDevice { CHECK(common::CheckNAN(candidate.split.fvalue)); std::vector split_cats; - CHECK_GT(candidate.split.split_cats.Bits().size(), 0); auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid); auto n_bins_feature = page->Cuts().FeatureBins(candidate.split.findex); split_cats.resize(common::CatBitField::ComputeStorageSize(n_bins_feature), 0); From ddf2e688219c0e5510a6931b7dac37e34ac14762 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 29 Aug 2023 13:37:29 +0800 Subject: [PATCH 127/136] Use the new `DeviceOrd` in the linalg module. (#9527) --- include/xgboost/host_device_vector.h | 8 +++ include/xgboost/learner.h | 2 +- include/xgboost/linalg.h | 73 +++++++--------------- include/xgboost/multi_target_tree_model.h | 4 +- src/c_api/c_api.cu | 2 +- src/common/linalg_op.cuh | 2 +- src/common/ranking_utils.cu | 16 ++--- src/common/ranking_utils.h | 30 ++++----- src/common/stats.cc | 6 +- src/data/data.cc | 11 ++-- src/data/data.cu | 2 +- src/gbm/gblinear.cc | 13 ++-- src/gbm/gbtree.cc | 6 +- src/learner.cc | 16 ++--- src/metric/auc.cc | 28 ++++----- src/metric/auc.cu | 34 +++++----- src/metric/auc.h | 4 +- src/metric/elementwise_metric.cu | 22 +++---- src/metric/rank_metric.cc | 4 +- src/metric/rank_metric.cu | 8 +-- src/objective/adaptive.cu | 14 ++--- src/objective/lambdarank_obj.cc | 26 ++++---- src/objective/lambdarank_obj.cu | 38 +++++------ src/objective/quantile_obj.cu | 4 +- src/objective/regression_obj.cu | 6 +- src/predictor/cpu_predictor.cc | 8 +-- src/predictor/predictor.cc | 2 +- src/tree/fit_stump.cc | 2 +- src/tree/fit_stump.cu | 4 +- src/tree/hist/evaluate_splits.h | 4 +- src/tree/updater_gpu_hist.cu | 5 +- tests/cpp/common/test_linalg.cc | 46 +++++++------- tests/cpp/common/test_linalg.cu | 11 ++-- tests/cpp/common/test_ranking_utils.cu | 2 +- tests/cpp/common/test_stats.cu | 14 ++--- tests/cpp/data/test_array_interface.cc | 2 +- tests/cpp/data/test_metainfo.cc | 10 +-- tests/cpp/data/test_metainfo.cu | 4 +- tests/cpp/data/test_metainfo.h | 12 ++-- tests/cpp/data/test_simple_dmatrix.cc | 8 +-- tests/cpp/objective/test_lambdarank_obj.cu | 8 +-- tests/cpp/predictor/test_gpu_predictor.cu | 2 +- tests/cpp/predictor/test_predictor.cc | 2 +- 43 files changed, 252 insertions(+), 273 deletions(-) diff --git a/include/xgboost/host_device_vector.h b/include/xgboost/host_device_vector.h index b221d7206..ed7117d65 100644 --- a/include/xgboost/host_device_vector.h +++ b/include/xgboost/host_device_vector.h @@ -102,6 +102,14 @@ class HostDeviceVector { bool Empty() const { return Size() == 0; } size_t Size() const; int DeviceIdx() const; + DeviceOrd Device() const { + auto idx = this->DeviceIdx(); + if (idx == DeviceOrd::CPU().ordinal) { + return DeviceOrd::CPU(); + } else { + return DeviceOrd::CUDA(idx); + } + } common::Span DeviceSpan(); common::Span ConstDeviceSpan() const; common::Span DeviceSpan() const { return ConstDeviceSpan(); } diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h index cd081a2e8..939324e4a 100644 --- a/include/xgboost/learner.h +++ b/include/xgboost/learner.h @@ -330,7 +330,7 @@ struct LearnerModelParam { multi_strategy{multi_strategy} {} linalg::TensorView BaseScore(Context const* ctx) const; - [[nodiscard]] linalg::TensorView BaseScore(std::int32_t device) const; + [[nodiscard]] linalg::TensorView BaseScore(DeviceOrd device) const; void Copy(LearnerModelParam const& that); [[nodiscard]] bool IsVectorLeaf() 
const noexcept { diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index d95651ca7..b3ae2f169 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -302,7 +302,7 @@ class TensorView { T *ptr_{nullptr}; // pointer of data_ to avoid bound check. size_t size_{0}; - int32_t device_{-1}; + DeviceOrd device_; // Unlike `Tensor`, the data_ can have arbitrary size since this is just a view. LINALG_HD void CalcSize() { @@ -401,15 +401,11 @@ class TensorView { * \param device Device ordinal */ template - LINALG_HD TensorView(common::Span data, I const (&shape)[D], std::int32_t device) + LINALG_HD TensorView(common::Span data, I const (&shape)[D], DeviceOrd device) : TensorView{data, shape, device, Order::kC} {} - template - LINALG_HD TensorView(common::Span data, I const (&shape)[D], DeviceOrd device) - : TensorView{data, shape, device.ordinal, Order::kC} {} - template - LINALG_HD TensorView(common::Span data, I const (&shape)[D], std::int32_t device, Order order) + LINALG_HD TensorView(common::Span data, I const (&shape)[D], DeviceOrd device, Order order) : data_{data}, ptr_{data_.data()}, device_{device} { static_assert(D > 0 && D <= kDim, "Invalid shape."); // shape @@ -441,7 +437,7 @@ class TensorView { */ template LINALG_HD TensorView(common::Span data, I const (&shape)[D], I const (&stride)[D], - std::int32_t device) + DeviceOrd device) : data_{data}, ptr_{data_.data()}, device_{device} { static_assert(D == kDim, "Invalid shape & stride."); detail::UnrollLoop([&](auto i) { @@ -450,16 +446,12 @@ class TensorView { }); this->CalcSize(); } - template - LINALG_HD TensorView(common::Span data, I const (&shape)[D], I const (&stride)[D], - DeviceOrd device) - : TensorView{data, shape, stride, device.ordinal} {} template < typename U, std::enable_if_t::value> * = nullptr> LINALG_HD TensorView(TensorView const &that) // NOLINT - : data_{that.Values()}, ptr_{data_.data()}, size_{that.Size()}, device_{that.DeviceIdx()} { + : data_{that.Values()}, ptr_{data_.data()}, size_{that.Size()}, device_{that.Device()} { detail::UnrollLoop([&](auto i) { stride_[i] = that.Stride(i); shape_[i] = that.Shape(i); @@ -572,7 +564,7 @@ class TensorView { /** * \brief Obtain the CUDA device ordinal. */ - LINALG_HD auto DeviceIdx() const { return device_; } + LINALG_HD auto Device() const { return device_; } }; /** @@ -587,11 +579,11 @@ auto MakeTensorView(Context const *ctx, Container &data, S &&...shape) { // NOL typename Container::value_type>; std::size_t in_shape[sizeof...(S)]; detail::IndexToArr(in_shape, std::forward(shape)...); - return TensorView{data, in_shape, ctx->gpu_id}; + return TensorView{data, in_shape, ctx->Device()}; } template -LINALG_HD auto MakeTensorView(std::int32_t device, common::Span data, S &&...shape) { +LINALG_HD auto MakeTensorView(DeviceOrd device, common::Span data, S &&...shape) { std::size_t in_shape[sizeof...(S)]; detail::IndexToArr(in_shape, std::forward(shape)...); return TensorView{data, in_shape, device}; @@ -599,26 +591,26 @@ LINALG_HD auto MakeTensorView(std::int32_t device, common::Span data, S &&... 
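A hedged usage sketch for the `DeviceOrd`-based factories in this hunk (assumes XGBoost's `<xgboost/linalg.h>`; the buffer and shape are illustrative):

    float buf[12] = {0};
    auto m = linalg::MakeTensorView(DeviceOrd::CPU(), common::Span<float>{buf, 12}, 3, 4);
    // m.Device().IsCPU() holds, and the view defaults to row-major (Order::kC),
    // so m(i, j) addresses buf[i * 4 + j].
    m(1, 2) = 5.0f;  // writes buf[6]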
template auto MakeTensorView(Context const *ctx, common::Span data, S &&...shape) { - return MakeTensorView(ctx->gpu_id, data, std::forward(shape)...); + return MakeTensorView(ctx->Device(), data, std::forward(shape)...); } template auto MakeTensorView(Context const *ctx, Order order, common::Span data, S &&...shape) { std::size_t in_shape[sizeof...(S)]; detail::IndexToArr(in_shape, std::forward(shape)...); - return TensorView{data, in_shape, ctx->Ordinal(), order}; + return TensorView{data, in_shape, ctx->Device(), order}; } template auto MakeTensorView(Context const *ctx, HostDeviceVector *data, S &&...shape) { auto span = ctx->IsCPU() ? data->HostSpan() : data->DeviceSpan(); - return MakeTensorView(ctx->gpu_id, span, std::forward(shape)...); + return MakeTensorView(ctx->Device(), span, std::forward(shape)...); } template auto MakeTensorView(Context const *ctx, HostDeviceVector const *data, S &&...shape) { auto span = ctx->IsCPU() ? data->ConstHostSpan() : data->ConstDeviceSpan(); - return MakeTensorView(ctx->gpu_id, span, std::forward(shape)...); + return MakeTensorView(ctx->Device(), span, std::forward(shape)...); } /** @@ -661,20 +653,20 @@ using VectorView = TensorView; * \param device (optional) Device ordinal, default to be host. */ template -auto MakeVec(T *ptr, size_t s, int32_t device = -1) { +auto MakeVec(T *ptr, size_t s, DeviceOrd device = DeviceOrd::CPU()) { return linalg::TensorView{{ptr, s}, {s}, device}; } template auto MakeVec(HostDeviceVector *data) { return MakeVec(data->DeviceIdx() == -1 ? data->HostPointer() : data->DevicePointer(), - data->Size(), data->DeviceIdx()); + data->Size(), data->Device()); } template auto MakeVec(HostDeviceVector const *data) { return MakeVec(data->DeviceIdx() == -1 ? data->ConstHostPointer() : data->ConstDevicePointer(), - data->Size(), data->DeviceIdx()); + data->Size(), data->Device()); } /** @@ -697,7 +689,7 @@ Json ArrayInterface(TensorView const &t) { array_interface["data"] = std::vector(2); array_interface["data"][0] = Integer{reinterpret_cast(t.Values().data())}; array_interface["data"][1] = Boolean{true}; - if (t.DeviceIdx() >= 0) { + if (t.Device().IsCUDA()) { // Change this once we have different CUDA stream. array_interface["stream"] = Null{}; } @@ -856,49 +848,29 @@ class Tensor { /** * @brief Get a @ref TensorView for this tensor. 
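A small usage sketch for the `MakeVec` overloads above (again assuming XGBoost's `<xgboost/linalg.h>`): the device tag now defaults to `DeviceOrd::CPU()`, and a CUDA tag merely labels the pointer, so the caller must pass a genuine device pointer for CUDA ordinals.

    double host_buf[4] = {0.0, 1.0, 2.0, 3.0};
    auto v = linalg::MakeVec(host_buf, 4);  // tagged DeviceOrd::CPU() by default
    v(2) = 7.0;                             // writes through to host_buf[2]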
*/ - TensorView View(std::int32_t device) { - if (device >= 0) { - data_.SetDevice(device); - auto span = data_.DeviceSpan(); - return {span, shape_, device, order_}; - } else { - auto span = data_.HostSpan(); - return {span, shape_, device, order_}; - } - } - TensorView View(std::int32_t device) const { - if (device >= 0) { - data_.SetDevice(device); - auto span = data_.ConstDeviceSpan(); - return {span, shape_, device, order_}; - } else { - auto span = data_.ConstHostSpan(); - return {span, shape_, device, order_}; - } - } auto View(DeviceOrd device) { if (device.IsCUDA()) { data_.SetDevice(device); auto span = data_.DeviceSpan(); - return TensorView{span, shape_, device.ordinal, order_}; + return TensorView{span, shape_, device, order_}; } else { auto span = data_.HostSpan(); - return TensorView{span, shape_, device.ordinal, order_}; + return TensorView{span, shape_, device, order_}; } } auto View(DeviceOrd device) const { if (device.IsCUDA()) { data_.SetDevice(device); auto span = data_.ConstDeviceSpan(); - return TensorView{span, shape_, device.ordinal, order_}; + return TensorView{span, shape_, device, order_}; } else { auto span = data_.ConstHostSpan(); - return TensorView{span, shape_, device.ordinal, order_}; + return TensorView{span, shape_, device, order_}; } } - auto HostView() const { return this->View(-1); } - auto HostView() { return this->View(-1); } + auto HostView() { return this->View(DeviceOrd::CPU()); } + auto HostView() const { return this->View(DeviceOrd::CPU()); } [[nodiscard]] size_t Size() const { return data_.Size(); } auto Shape() const { return common::Span{shape_}; } @@ -975,6 +947,7 @@ class Tensor { void SetDevice(int32_t device) const { data_.SetDevice(device); } void SetDevice(DeviceOrd device) const { data_.SetDevice(device); } [[nodiscard]] int32_t DeviceIdx() const { return data_.DeviceIdx(); } + [[nodiscard]] DeviceOrd Device() const { return data_.Device(); } }; template diff --git a/include/xgboost/multi_target_tree_model.h b/include/xgboost/multi_target_tree_model.h index 1ad7d6bf6..676c43196 100644 --- a/include/xgboost/multi_target_tree_model.h +++ b/include/xgboost/multi_target_tree_model.h @@ -37,12 +37,12 @@ class MultiTargetTree : public Model { [[nodiscard]] linalg::VectorView NodeWeight(bst_node_t nidx) const { auto beg = nidx * this->NumTarget(); auto v = common::Span{weights_}.subspan(beg, this->NumTarget()); - return linalg::MakeTensorView(Context::kCpuId, v, v.size()); + return linalg::MakeTensorView(DeviceOrd::CPU(), v, v.size()); } [[nodiscard]] linalg::VectorView NodeWeight(bst_node_t nidx) { auto beg = nidx * this->NumTarget(); auto v = common::Span{weights_}.subspan(beg, this->NumTarget()); - return linalg::MakeTensorView(Context::kCpuId, v, v.size()); + return linalg::MakeTensorView(DeviceOrd::CPU(), v, v.size()); } public: diff --git a/src/c_api/c_api.cu b/src/c_api/c_api.cu index 21674f785..1dddb1444 100644 --- a/src/c_api/c_api.cu +++ b/src/c_api/c_api.cu @@ -68,7 +68,7 @@ void CopyGradientFromCUDAArrays(Context const *ctx, ArrayInterface<2, false> con auto &gpair = *out_gpair; gpair.SetDevice(grad_dev); gpair.Reshape(grad.Shape(0), grad.Shape(1)); - auto d_gpair = gpair.View(grad_dev); + auto d_gpair = gpair.View(DeviceOrd::CUDA(grad_dev)); auto cuctx = ctx->CUDACtx(); DispatchDType(grad, DeviceOrd::CUDA(grad_dev), [&](auto &&t_grad) { diff --git a/src/common/linalg_op.cuh b/src/common/linalg_op.cuh index 037ad1ff3..5d52e4100 100644 --- a/src/common/linalg_op.cuh +++ b/src/common/linalg_op.cuh @@ -13,7 +13,7 @@ namespace 
xgboost { namespace linalg { template void ElementWiseKernelDevice(linalg::TensorView t, Fn&& fn, cudaStream_t s = nullptr) { - dh::safe_cuda(cudaSetDevice(t.DeviceIdx())); + dh::safe_cuda(cudaSetDevice(t.Device().ordinal)); static_assert(std::is_void>::value, "For function with return, use transform instead."); if (t.Contiguous()) { diff --git a/src/common/ranking_utils.cu b/src/common/ranking_utils.cu index 283ccc21d..5ad8a575c 100644 --- a/src/common/ranking_utils.cu +++ b/src/common/ranking_utils.cu @@ -133,7 +133,7 @@ struct WeightOp { void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) { CUDAContext const* cuctx = ctx->CUDACtx(); - group_ptr_.SetDevice(ctx->gpu_id); + group_ptr_.SetDevice(ctx->Device()); if (info.group_ptr_.empty()) { group_ptr_.Resize(2, 0); group_ptr_.HostVector()[1] = info.num_row_; @@ -153,7 +153,7 @@ void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) { max_group_size_ = thrust::reduce(cuctx->CTP(), it, it + n_groups, 0ul, thrust::maximum{}); - threads_group_ptr_.SetDevice(ctx->gpu_id); + threads_group_ptr_.SetDevice(ctx->Device()); threads_group_ptr_.Resize(n_groups + 1, 0); auto d_threads_group_ptr = threads_group_ptr_.DeviceSpan(); if (param_.HasTruncation()) { @@ -168,7 +168,7 @@ void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) { n_cuda_threads_ = info.num_row_ * param_.NumPair(); } - sorted_idx_cache_.SetDevice(ctx->gpu_id); + sorted_idx_cache_.SetDevice(ctx->Device()); sorted_idx_cache_.Resize(info.labels.Size(), 0); auto weight = common::MakeOptionalWeights(ctx, info.weights_); @@ -187,18 +187,18 @@ common::Span RankingCache::MakeRankOnCUDA(Context const* ctx, void NDCGCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) { CUDAContext const* cuctx = ctx->CUDACtx(); - auto labels = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0); + auto labels = info.labels.View(ctx->Device()).Slice(linalg::All(), 0); CheckNDCGLabels(this->Param(), labels, CheckNDCGOp{cuctx}); auto d_group_ptr = this->DataGroupPtr(ctx); std::size_t n_groups = d_group_ptr.size() - 1; inv_idcg_ = linalg::Zeros(ctx, n_groups); - auto d_inv_idcg = inv_idcg_.View(ctx->gpu_id); + auto d_inv_idcg = inv_idcg_.View(ctx->Device()); cuda_impl::CalcQueriesInvIDCG(ctx, labels, d_group_ptr, d_inv_idcg, this->Param()); CHECK_GE(this->Param().NumPair(), 1ul); - discounts_.SetDevice(ctx->gpu_id); + discounts_.SetDevice(ctx->Device()); discounts_.Resize(MaxGroupSize()); auto d_discount = discounts_.DeviceSpan(); dh::LaunchN(MaxGroupSize(), cuctx->Stream(), @@ -206,12 +206,12 @@ void NDCGCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) { } void PreCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) { - auto const d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0); + auto const d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0); CheckPreLabels("pre", d_label, CheckMAPOp{ctx->CUDACtx()}); } void MAPCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) { - auto const d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0); + auto const d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0); CheckPreLabels("map", d_label, CheckMAPOp{ctx->CUDACtx()}); } } // namespace xgboost::ltr diff --git a/src/common/ranking_utils.h b/src/common/ranking_utils.h index 75622bd84..31531a597 100644 --- a/src/common/ranking_utils.h +++ b/src/common/ranking_utils.h @@ -217,7 +217,7 @@ class RankingCache { } // Constructed as [1, n_samples] if group ptr is not supplied by the user 
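  // For example (illustrative, not part of the patch): with 5 rows and no
  // user-supplied group_ptr_, the cache stores {0, 5}, i.e. one "group" covering
  // every sample; with user groups {0, 3, 5} there are two queries of sizes 3 and 2.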
common::Span DataGroupPtr(Context const* ctx) const { - group_ptr_.SetDevice(ctx->gpu_id); + group_ptr_.SetDevice(ctx->Device()); return ctx->IsCPU() ? group_ptr_.ConstHostSpan() : group_ptr_.ConstDeviceSpan(); } @@ -228,7 +228,7 @@ class RankingCache { // Create a rank list by model prediction common::Span SortedIdx(Context const* ctx, common::Span predt) { if (sorted_idx_cache_.Empty()) { - sorted_idx_cache_.SetDevice(ctx->gpu_id); + sorted_idx_cache_.SetDevice(ctx->Device()); sorted_idx_cache_.Resize(predt.size()); } if (ctx->IsCPU()) { @@ -242,7 +242,7 @@ class RankingCache { common::Span SortedIdxY(Context const* ctx, std::size_t n_samples) { CHECK(ctx->IsCUDA()) << error::InvalidCUDAOrdinal(); if (y_sorted_idx_cache_.Empty()) { - y_sorted_idx_cache_.SetDevice(ctx->gpu_id); + y_sorted_idx_cache_.SetDevice(ctx->Device()); y_sorted_idx_cache_.Resize(n_samples); } return y_sorted_idx_cache_.DeviceSpan(); @@ -250,7 +250,7 @@ class RankingCache { common::Span RankedY(Context const* ctx, std::size_t n_samples) { CHECK(ctx->IsCUDA()) << error::InvalidCUDAOrdinal(); if (y_ranked_by_model_.Empty()) { - y_ranked_by_model_.SetDevice(ctx->gpu_id); + y_ranked_by_model_.SetDevice(ctx->Device()); y_ranked_by_model_.Resize(n_samples); } return y_ranked_by_model_.DeviceSpan(); @@ -266,21 +266,21 @@ class RankingCache { linalg::VectorView CUDARounding(Context const* ctx) { if (roundings_.Size() == 0) { - roundings_.SetDevice(ctx->gpu_id); + roundings_.SetDevice(ctx->Device()); roundings_.Reshape(Groups()); } - return roundings_.View(ctx->gpu_id); + return roundings_.View(ctx->Device()); } common::Span CUDACostRounding(Context const* ctx) { if (cost_rounding_.Size() == 0) { - cost_rounding_.SetDevice(ctx->gpu_id); + cost_rounding_.SetDevice(ctx->Device()); cost_rounding_.Resize(1); } return cost_rounding_.DeviceSpan(); } template common::Span MaxLambdas(Context const* ctx, std::size_t n) { - max_lambdas_.SetDevice(ctx->gpu_id); + max_lambdas_.SetDevice(ctx->Device()); std::size_t bytes = n * sizeof(Type); if (bytes != max_lambdas_.Size()) { max_lambdas_.Resize(bytes); @@ -315,17 +315,17 @@ class NDCGCache : public RankingCache { } linalg::VectorView InvIDCG(Context const* ctx) const { - return inv_idcg_.View(ctx->gpu_id); + return inv_idcg_.View(ctx->Device()); } common::Span Discount(Context const* ctx) const { return ctx->IsCPU() ? discounts_.ConstHostSpan() : discounts_.ConstDeviceSpan(); } linalg::VectorView Dcg(Context const* ctx) { if (dcg_.Size() == 0) { - dcg_.SetDevice(ctx->gpu_id); + dcg_.SetDevice(ctx->Device()); dcg_.Reshape(this->Groups()); } - return dcg_.View(ctx->gpu_id); + return dcg_.View(ctx->Device()); } }; @@ -396,7 +396,7 @@ class PreCache : public RankingCache { common::Span Pre(Context const* ctx) { if (pre_.Empty()) { - pre_.SetDevice(ctx->gpu_id); + pre_.SetDevice(ctx->Device()); pre_.Resize(this->Groups()); } return ctx->IsCPU() ? pre_.HostSpan() : pre_.DeviceSpan(); @@ -427,21 +427,21 @@ class MAPCache : public RankingCache { common::Span NumRelevant(Context const* ctx) { if (n_rel_.Empty()) { - n_rel_.SetDevice(ctx->gpu_id); + n_rel_.SetDevice(ctx->Device()); n_rel_.Resize(n_samples_); } return ctx->IsCPU() ? n_rel_.HostSpan() : n_rel_.DeviceSpan(); } common::Span Acc(Context const* ctx) { if (acc_.Empty()) { - acc_.SetDevice(ctx->gpu_id); + acc_.SetDevice(ctx->Device()); acc_.Resize(n_samples_); } return ctx->IsCPU() ? 
acc_.HostSpan() : acc_.DeviceSpan(); } common::Span Map(Context const* ctx) { if (map_.Empty()) { - map_.SetDevice(ctx->gpu_id); + map_.SetDevice(ctx->Device()); map_.Resize(this->Groups()); } return ctx->IsCPU() ? map_.HostSpan() : map_.DeviceSpan(); diff --git a/src/common/stats.cc b/src/common/stats.cc index 80fc2c50d..03ee00b87 100644 --- a/src/common/stats.cc +++ b/src/common/stats.cc @@ -20,9 +20,9 @@ namespace common { void Median(Context const* ctx, linalg::Tensor const& t, HostDeviceVector const& weights, linalg::Tensor* out) { if (!ctx->IsCPU()) { - weights.SetDevice(ctx->gpu_id); + weights.SetDevice(ctx->Device()); auto opt_weights = OptionalWeights(weights.ConstDeviceSpan()); - auto t_v = t.View(ctx->gpu_id); + auto t_v = t.View(ctx->Device()); cuda_impl::Median(ctx, t_v, opt_weights, out); } @@ -59,7 +59,7 @@ void Mean(Context const* ctx, linalg::Vector const& v, linalg::VectorHostView()(0) = ret; } else { - cuda_impl::Mean(ctx, v.View(ctx->gpu_id), out->View(ctx->gpu_id)); + cuda_impl::Mean(ctx, v.View(ctx->Device()), out->View(ctx->Device())); } } } // namespace common diff --git a/src/data/data.cc b/src/data/data.cc index 467770715..f143faf97 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -366,7 +366,7 @@ MetaInfo MetaInfo::Slice(common::Span ridxs) const { // Groups is maintained by a higher level Python function. We should aim at deprecating // the slice function. if (this->labels.Size() != this->num_row_) { - auto t_labels = this->labels.View(this->labels.Data()->DeviceIdx()); + auto t_labels = this->labels.View(this->labels.Data()->Device()); out.labels.Reshape(ridxs.size(), labels.Shape(1)); out.labels.Data()->HostVector() = Gather(this->labels.Data()->HostVector(), ridxs, t_labels.Stride(0)); @@ -394,7 +394,7 @@ MetaInfo MetaInfo::Slice(common::Span ridxs) const { if (this->base_margin_.Size() != this->num_row_) { CHECK_EQ(this->base_margin_.Size() % this->num_row_, 0) << "Incorrect size of base margin vector."; - auto t_margin = this->base_margin_.View(this->base_margin_.Data()->DeviceIdx()); + auto t_margin = this->base_margin_.View(this->base_margin_.Data()->Device()); out.base_margin_.Reshape(ridxs.size(), t_margin.Shape(1)); out.base_margin_.Data()->HostVector() = Gather(this->base_margin_.Data()->HostVector(), ridxs, t_margin.Stride(0)); @@ -445,7 +445,7 @@ void CopyTensorInfoImpl(Context const& ctx, Json arr_interface, linalg::TensorReshape(array.shape); - auto t_out = p_out->View(Context::kCpuId); + auto t_out = p_out->View(DeviceOrd::CPU()); CHECK(t_out.CContiguous()); auto const shape = t_out.Shape(); DispatchDType(array, DeviceOrd::CPU(), [&](auto&& in) { @@ -564,7 +564,7 @@ void MetaInfo::SetInfo(Context const& ctx, const char* key, const void* dptr, Da CHECK(key); auto proc = [&](auto cast_d_ptr) { using T = std::remove_pointer_t; - auto t = linalg::TensorView(common::Span{cast_d_ptr, num}, {num}, Context::kCpuId); + auto t = linalg::TensorView(common::Span{cast_d_ptr, num}, {num}, DeviceOrd::CPU()); CHECK(t.CContiguous()); Json interface { linalg::ArrayInterface(t) @@ -739,8 +739,7 @@ void MetaInfo::SynchronizeNumberOfColumns() { namespace { template void CheckDevice(std::int32_t device, HostDeviceVector const& v) { - bool valid = - v.DeviceIdx() == Context::kCpuId || device == Context::kCpuId || v.DeviceIdx() == device; + bool valid = v.Device().IsCPU() || device == Context::kCpuId || v.DeviceIdx() == device; if (!valid) { LOG(FATAL) << "Invalid device ordinal. Data is associated with a different device ordinal than " "the booster. 
The device ordinal of the data is: " diff --git a/src/data/data.cu b/src/data/data.cu index 0f1fda661..74db2b28c 100644 --- a/src/data/data.cu +++ b/src/data/data.cu @@ -50,7 +50,7 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens return; } p_out->Reshape(array.shape); - auto t = p_out->View(ptr_device); + auto t = p_out->View(DeviceOrd::CUDA(ptr_device)); linalg::ElementWiseTransformDevice( t, [=] __device__(size_t i, T) { diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc index bf4f6b92f..4b05d55f3 100644 --- a/src/gbm/gblinear.cc +++ b/src/gbm/gblinear.cc @@ -183,7 +183,7 @@ class GBLinear : public GradientBooster { bst_layer_t layer_begin, bst_layer_t /*layer_end*/, bool) override { model_.LazyInitModel(); LinearCheckLayer(layer_begin); - auto base_margin = p_fmat->Info().base_margin_.View(Context::kCpuId); + auto base_margin = p_fmat->Info().base_margin_.View(DeviceOrd::CPU()); const int ngroup = model_.learner_model_param->num_output_group; const size_t ncolumns = model_.learner_model_param->num_feature + 1; // allocate space for (#features + bias) times #groups times #rows @@ -250,10 +250,9 @@ class GBLinear : public GradientBooster { // The bias is the last weight out_scores->resize(model_.weight.size() - learner_model_param_->num_output_group, 0); auto n_groups = learner_model_param_->num_output_group; - linalg::TensorView scores{ - *out_scores, - {learner_model_param_->num_feature, n_groups}, - Context::kCpuId}; + auto scores = linalg::MakeTensorView(DeviceOrd::CPU(), + common::Span{out_scores->data(), out_scores->size()}, + learner_model_param_->num_feature, n_groups); for (size_t i = 0; i < learner_model_param_->num_feature; ++i) { for (bst_group_t g = 0; g < n_groups; ++g) { scores(i, g) = model_[i][g]; @@ -275,12 +274,12 @@ class GBLinear : public GradientBooster { monitor_.Start("PredictBatchInternal"); model_.LazyInitModel(); std::vector &preds = *out_preds; - auto base_margin = p_fmat->Info().base_margin_.View(Context::kCpuId); + auto base_margin = p_fmat->Info().base_margin_.View(DeviceOrd::CPU()); // start collecting the prediction const int ngroup = model_.learner_model_param->num_output_group; preds.resize(p_fmat->Info().num_row_ * ngroup); - auto base_score = learner_model_param_->BaseScore(Context::kCpuId); + auto base_score = learner_model_param_->BaseScore(DeviceOrd::CPU()); for (const auto &page : p_fmat->GetBatches()) { auto const& batch = page.GetView(); // output convention: nrow * k, where nrow is number of rows diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index e9c5be003..50dfe9262 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -754,7 +754,7 @@ class Dart : public GBTree { auto n_groups = model_.learner_model_param->num_output_group; PredictionCacheEntry predts; // temporary storage for prediction - if (ctx_->gpu_id != Context::kCpuId) { + if (ctx_->IsCUDA()) { predts.predictions.SetDevice(ctx_->gpu_id); } predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0); @@ -859,12 +859,12 @@ class Dart : public GBTree { size_t n_rows = p_fmat->Info().num_row_; if (predts.predictions.DeviceIdx() != Context::kCpuId) { p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx()); - auto base_score = model_.learner_model_param->BaseScore(predts.predictions.DeviceIdx()); + auto base_score = model_.learner_model_param->BaseScore(predts.predictions.Device()); GPUDartInplacePredictInc(p_out_preds->predictions.DeviceSpan(), predts.predictions.DeviceSpan(), w, n_rows, base_score, n_groups, group); } else { - 
auto base_score = model_.learner_model_param->BaseScore(Context::kCpuId); + auto base_score = model_.learner_model_param->BaseScore(DeviceOrd::CPU()); auto& h_predts = predts.predictions.HostVector(); auto& h_out_predts = p_out_preds->predictions.HostVector(); common::ParallelFor(n_rows, ctx_->Threads(), [&](auto ridx) { diff --git a/src/learner.cc b/src/learner.cc index be562f972..33725b612 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -279,15 +279,15 @@ LearnerModelParam::LearnerModelParam(Context const* ctx, LearnerModelParamLegacy // Make sure read access everywhere for thread-safe prediction. std::as_const(base_score_).HostView(); if (!ctx->IsCPU()) { - std::as_const(base_score_).View(ctx->gpu_id); + std::as_const(base_score_).View(ctx->Device()); } CHECK(std::as_const(base_score_).Data()->HostCanRead()); } -linalg::TensorView LearnerModelParam::BaseScore(int32_t device) const { +linalg::TensorView LearnerModelParam::BaseScore(DeviceOrd device) const { // multi-class is not yet supported. CHECK_EQ(base_score_.Size(), 1) << ModelNotFitted(); - if (device == Context::kCpuId) { + if (device.IsCPU()) { // Make sure that we won't run into race condition. CHECK(base_score_.Data()->HostCanRead()); return base_score_.HostView(); @@ -300,7 +300,7 @@ linalg::TensorView LearnerModelParam::BaseScore(int32_t device) } linalg::TensorView LearnerModelParam::BaseScore(Context const* ctx) const { - return this->BaseScore(ctx->gpu_id); + return this->BaseScore(ctx->Device()); } void LearnerModelParam::Copy(LearnerModelParam const& that) { @@ -309,7 +309,7 @@ void LearnerModelParam::Copy(LearnerModelParam const& that) { base_score_.Data()->Copy(*that.base_score_.Data()); std::as_const(base_score_).HostView(); if (that.base_score_.DeviceIdx() != Context::kCpuId) { - std::as_const(base_score_).View(that.base_score_.DeviceIdx()); + std::as_const(base_score_).View(that.base_score_.Device()); } CHECK_EQ(base_score_.Data()->DeviceCanRead(), that.base_score_.Data()->DeviceCanRead()); CHECK(base_score_.Data()->HostCanRead()); @@ -388,7 +388,7 @@ class LearnerConfiguration : public Learner { this->ConfigureTargets(); auto task = UsePtr(obj_)->Task(); - linalg::Tensor base_score({1}, Ctx()->gpu_id); + linalg::Tensor base_score({1}, Ctx()->Device()); auto h_base_score = base_score.HostView(); // transform to margin @@ -424,7 +424,7 @@ class LearnerConfiguration : public Learner { if (mparam_.boost_from_average && !UsePtr(gbm_)->ModelFitted()) { if (p_fmat) { auto const& info = p_fmat->Info(); - info.Validate(Ctx()->gpu_id); + info.Validate(Ctx()->Ordinal()); // We estimate it from input data. linalg::Tensor base_score; InitEstimation(info, &base_score); @@ -1369,7 +1369,7 @@ class LearnerImpl : public LearnerIO { auto& prediction = prediction_container_.Cache(data, ctx_.gpu_id); this->PredictRaw(data.get(), &prediction, training, layer_begin, layer_end); // Copy the prediction cache to output prediction. 
out_preds comes from C API - out_preds->SetDevice(ctx_.gpu_id); + out_preds->SetDevice(ctx_.Device()); out_preds->Resize(prediction.predictions.Size()); out_preds->Copy(prediction.predictions); if (!output_margin) { diff --git a/src/metric/auc.cc b/src/metric/auc.cc index 473f5b02c..a2e7372fb 100644 --- a/src/metric/auc.cc +++ b/src/metric/auc.cc @@ -82,22 +82,19 @@ template double MultiClassOVR(Context const *ctx, common::Span predts, MetaInfo const &info, size_t n_classes, int32_t n_threads, BinaryAUC &&binary_auc) { CHECK_NE(n_classes, 0); - auto const labels = info.labels.View(Context::kCpuId); + auto const labels = info.labels.HostView(); if (labels.Shape(0) != 0) { CHECK_EQ(labels.Shape(1), 1) << "AUC doesn't support multi-target model."; } std::vector results_storage(n_classes * 3, 0); - linalg::TensorView results(results_storage, {n_classes, static_cast(3)}, - Context::kCpuId); + auto results = linalg::MakeTensorView(ctx, results_storage, n_classes, 3); auto local_area = results.Slice(linalg::All(), 0); auto tp = results.Slice(linalg::All(), 1); auto auc = results.Slice(linalg::All(), 2); auto weights = common::OptionalWeights{info.weights_.ConstHostSpan()}; - auto predts_t = linalg::TensorView( - predts, {static_cast(info.num_row_), n_classes}, - Context::kCpuId); + auto predts_t = linalg::MakeTensorView(ctx, predts, info.num_row_, n_classes); if (info.labels.Size() != 0) { common::ParallelFor(n_classes, n_threads, [&](auto c) { @@ -108,8 +105,8 @@ double MultiClassOVR(Context const *ctx, common::Span predts, MetaI response[i] = labels(i) == c ? 1.0f : 0.0; } double fp; - std::tie(fp, tp(c), auc(c)) = - binary_auc(ctx, proba, linalg::MakeVec(response.data(), response.size(), -1), weights); + std::tie(fp, tp(c), auc(c)) = binary_auc( + ctx, proba, linalg::MakeVec(response.data(), response.size(), ctx->Device()), weights); local_area(c) = fp * tp(c); }); } @@ -220,7 +217,7 @@ std::pair RankingAUC(Context const *ctx, std::vector co CHECK_GE(info.group_ptr_.size(), 2); uint32_t n_groups = info.group_ptr_.size() - 1; auto s_predts = common::Span{predts}; - auto labels = info.labels.View(Context::kCpuId); + auto labels = info.labels.View(ctx->Device()); auto s_weights = info.weights_.ConstHostSpan(); std::atomic invalid_groups{0}; @@ -363,8 +360,8 @@ class EvalROCAUC : public EvalAUC { info.labels.HostView().Slice(linalg::All(), 0), common::OptionalWeights{info.weights_.ConstHostSpan()}); } else { - std::tie(fp, tp, auc) = GPUBinaryROCAUC(predts.ConstDeviceSpan(), info, - ctx_->gpu_id, &this->d_cache_); + std::tie(fp, tp, auc) = + GPUBinaryROCAUC(predts.ConstDeviceSpan(), info, ctx_->Device(), &this->d_cache_); } return std::make_tuple(fp, tp, auc); } @@ -381,8 +378,7 @@ XGBOOST_REGISTER_METRIC(EvalAUC, "auc") #if !defined(XGBOOST_USE_CUDA) std::tuple GPUBinaryROCAUC(common::Span, MetaInfo const &, - std::int32_t, - std::shared_ptr *) { + DeviceOrd, std::shared_ptr *) { common::AssertGPUSupport(); return {}; } @@ -414,8 +410,8 @@ class EvalPRAUC : public EvalAUC { BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0), common::OptionalWeights{info.weights_.ConstHostSpan()}); } else { - std::tie(pr, re, auc) = GPUBinaryPRAUC(predts.ConstDeviceSpan(), info, - ctx_->gpu_id, &this->d_cache_); + std::tie(pr, re, auc) = + GPUBinaryPRAUC(predts.ConstDeviceSpan(), info, ctx_->Device(), &this->d_cache_); } return std::make_tuple(pr, re, auc); } @@ -459,7 +455,7 @@ XGBOOST_REGISTER_METRIC(AUCPR, "aucpr") #if !defined(XGBOOST_USE_CUDA) std::tuple 
GPUBinaryPRAUC(common::Span, MetaInfo const &, - std::int32_t, std::shared_ptr *) { + DeviceOrd, std::shared_ptr *) { common::AssertGPUSupport(); return {}; } diff --git a/src/metric/auc.cu b/src/metric/auc.cu index 6e3032e42..dd9e4483f 100644 --- a/src/metric/auc.cu +++ b/src/metric/auc.cu @@ -85,11 +85,11 @@ void InitCacheOnce(common::Span predts, std::shared_ptr std::tuple GPUBinaryAUC(common::Span predts, MetaInfo const &info, - int32_t device, common::Span d_sorted_idx, + DeviceOrd device, common::Span d_sorted_idx, Fn area_fn, std::shared_ptr cache) { auto labels = info.labels.View(device); auto weights = info.weights_.ConstDeviceSpan(); - dh::safe_cuda(cudaSetDevice(device)); + dh::safe_cuda(cudaSetDevice(device.ordinal)); CHECK_NE(labels.Size(), 0); CHECK_EQ(labels.Size(), predts.size()); @@ -168,7 +168,7 @@ GPUBinaryAUC(common::Span predts, MetaInfo const &info, } std::tuple GPUBinaryROCAUC(common::Span predts, - MetaInfo const &info, std::int32_t device, + MetaInfo const &info, DeviceOrd device, std::shared_ptr *p_cache) { auto &cache = *p_cache; InitCacheOnce(predts, p_cache); @@ -309,9 +309,10 @@ void SegmentedReduceAUC(common::Span d_unique_idx, * up each class in all kernels. */ template -double GPUMultiClassAUCOVR(MetaInfo const &info, int32_t device, common::Span d_class_ptr, - size_t n_classes, std::shared_ptr cache, Fn area_fn) { - dh::safe_cuda(cudaSetDevice(device)); +double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device, + common::Span d_class_ptr, size_t n_classes, + std::shared_ptr cache, Fn area_fn) { + dh::safe_cuda(cudaSetDevice(device.ordinal)); /** * Sorted idx */ @@ -467,11 +468,12 @@ double GPUMultiClassROCAUC(Context const *ctx, common::Span predts, dh::TemporaryArray class_ptr(n_classes + 1, 0); MultiClassSortedIdx(ctx, predts, dh::ToSpan(class_ptr), cache); - auto fn = [] XGBOOST_DEVICE(double fp_prev, double fp, double tp_prev, - double tp, size_t /*class_id*/) { + auto fn = [] XGBOOST_DEVICE(double fp_prev, double fp, double tp_prev, double tp, + size_t /*class_id*/) { return TrapezoidArea(fp_prev, fp, tp_prev, tp); }; - return GPUMultiClassAUCOVR(info, ctx->gpu_id, dh::ToSpan(class_ptr), n_classes, cache, fn); + return GPUMultiClassAUCOVR(info, ctx->Device(), dh::ToSpan(class_ptr), n_classes, cache, + fn); } namespace { @@ -512,7 +514,7 @@ std::pair GPURankingAUC(Context const *ctx, common::Span< /** * Sort the labels */ - auto d_labels = info.labels.View(ctx->gpu_id); + auto d_labels = info.labels.View(ctx->Device()); auto d_sorted_idx = dh::ToSpan(cache->sorted_idx); common::SegmentedArgSort(ctx, d_labels.Values(), d_group_ptr, d_sorted_idx); @@ -604,7 +606,7 @@ std::pair GPURankingAUC(Context const *ctx, common::Span< } std::tuple GPUBinaryPRAUC(common::Span predts, - MetaInfo const &info, std::int32_t device, + MetaInfo const &info, DeviceOrd device, std::shared_ptr *p_cache) { auto& cache = *p_cache; InitCacheOnce(predts, p_cache); @@ -662,7 +664,7 @@ double GPUMultiClassPRAUC(Context const *ctx, common::Span predts, /** * Get total positive/negative */ - auto labels = info.labels.View(ctx->gpu_id); + auto labels = info.labels.View(ctx->Device()); auto n_samples = info.num_row_; dh::caching_device_vector totals(n_classes); auto key_it = @@ -695,13 +697,13 @@ double GPUMultiClassPRAUC(Context const *ctx, common::Span predts, return detail::CalcDeltaPRAUC(fp_prev, fp, tp_prev, tp, d_totals[class_id].first); }; - return GPUMultiClassAUCOVR(info, ctx->gpu_id, d_class_ptr, n_classes, cache, fn); + return GPUMultiClassAUCOVR(info, 
ctx->Device(), d_class_ptr, n_classes, cache, fn); } template std::pair GPURankingPRAUCImpl(common::Span predts, MetaInfo const &info, - common::Span d_group_ptr, int32_t device, + common::Span d_group_ptr, DeviceOrd device, std::shared_ptr cache, Fn area_fn) { /** * Sorted idx @@ -843,7 +845,7 @@ std::pair GPURankingPRAUC(Context const *ctx, common::SegmentedArgSort(ctx, predts, d_group_ptr, d_sorted_idx); dh::XGBDeviceAllocator alloc; - auto labels = info.labels.View(ctx->gpu_id); + auto labels = info.labels.View(ctx->Device()); if (thrust::any_of(thrust::cuda::par(alloc), dh::tbegin(labels.Values()), dh::tend(labels.Values()), PRAUCLabelInvalid{})) { InvalidLabels(); @@ -882,7 +884,7 @@ std::pair GPURankingPRAUC(Context const *ctx, return detail::CalcDeltaPRAUC(fp_prev, fp, tp_prev, tp, d_totals[group_id].first); }; - return GPURankingPRAUCImpl(predts, info, d_group_ptr, ctx->gpu_id, cache, fn); + return GPURankingPRAUCImpl(predts, info, d_group_ptr, ctx->Device(), cache, fn); } } // namespace metric } // namespace xgboost diff --git a/src/metric/auc.h b/src/metric/auc.h index d8e7f4344..fce1cc757 100644 --- a/src/metric/auc.h +++ b/src/metric/auc.h @@ -30,7 +30,7 @@ XGBOOST_DEVICE inline double TrapezoidArea(double x0, double x1, double y0, doub struct DeviceAUCCache; std::tuple GPUBinaryROCAUC(common::Span predts, - MetaInfo const &info, std::int32_t device, + MetaInfo const &info, DeviceOrd, std::shared_ptr *p_cache); double GPUMultiClassROCAUC(Context const *ctx, common::Span predts, @@ -45,7 +45,7 @@ std::pair GPURankingAUC(Context const *ctx, common::Span< * PR AUC * **********/ std::tuple GPUBinaryPRAUC(common::Span predts, - MetaInfo const &info, std::int32_t device, + MetaInfo const &info, DeviceOrd, std::shared_ptr *p_cache); double GPUMultiClassPRAUC(Context const *ctx, common::Span predts, diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu index b6888610b..e16f9f8cc 100644 --- a/src/metric/elementwise_metric.cu +++ b/src/metric/elementwise_metric.cu @@ -45,7 +45,7 @@ namespace { template PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) { PackedReduceResult result; - auto labels = info.labels.View(ctx->gpu_id); + auto labels = info.labels.View(ctx->Device()); if (ctx->IsCPU()) { auto n_threads = ctx->Threads(); std::vector score_tloc(n_threads, 0.0); @@ -183,10 +183,10 @@ class PseudoErrorLoss : public MetricNoCache { double Eval(const HostDeviceVector& preds, const MetaInfo& info) override { CHECK_EQ(info.labels.Shape(0), info.num_row_); - auto labels = info.labels.View(ctx_->gpu_id); - preds.SetDevice(ctx_->gpu_id); + auto labels = info.labels.View(ctx_->Device()); + preds.SetDevice(ctx_->Device()); auto predts = ctx_->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan(); - info.weights_.SetDevice(ctx_->gpu_id); + info.weights_.SetDevice(ctx_->Device()); common::OptionalWeights weights(ctx_->IsCPU() ? info.weights_.ConstHostSpan() : info.weights_.ConstDeviceSpan()); float slope = this->param_.huber_slope; @@ -349,11 +349,11 @@ struct EvalEWiseBase : public MetricNoCache { if (info.labels.Size() != 0) { CHECK_NE(info.labels.Shape(1), 0); } - auto labels = info.labels.View(ctx_->gpu_id); - info.weights_.SetDevice(ctx_->gpu_id); + auto labels = info.labels.View(ctx_->Device()); + info.weights_.SetDevice(ctx_->Device()); common::OptionalWeights weights(ctx_->IsCPU() ? 
info.weights_.ConstHostSpan() : info.weights_.ConstDeviceSpan()); - preds.SetDevice(ctx_->gpu_id); + preds.SetDevice(ctx_->Device()); auto predts = ctx_->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan(); auto d_policy = policy_; @@ -444,16 +444,16 @@ class QuantileError : public MetricNoCache { } auto const* ctx = ctx_; - auto y_true = info.labels.View(ctx->gpu_id); - preds.SetDevice(ctx->gpu_id); - alpha_.SetDevice(ctx->gpu_id); + auto y_true = info.labels.View(ctx->Device()); + preds.SetDevice(ctx->Device()); + alpha_.SetDevice(ctx->Device()); auto alpha = ctx->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan(); std::size_t n_targets = preds.Size() / info.num_row_ / alpha_.Size(); CHECK_NE(n_targets, 0); auto y_predt = linalg::MakeTensorView(ctx, &preds, static_cast(info.num_row_), alpha_.Size(), n_targets); - info.weights_.SetDevice(ctx->gpu_id); + info.weights_.SetDevice(ctx->Device()); common::OptionalWeights weight{ctx->IsCPU() ? info.weights_.ConstHostSpan() : info.weights_.ConstDeviceSpan()}; diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc index 8df6e585f..41495164c 100644 --- a/src/metric/rank_metric.cc +++ b/src/metric/rank_metric.cc @@ -75,7 +75,7 @@ struct EvalAMS : public MetricNoCache { const double br = 10.0; unsigned thresindex = 0; double s_tp = 0.0, b_fp = 0.0, tams = 0.0; - const auto& labels = info.labels.View(Context::kCpuId); + const auto& labels = info.labels.View(DeviceOrd::CPU()); for (unsigned i = 0; i < static_cast(ndata-1) && i < ntop; ++i) { const unsigned ridx = rec[i].second; const bst_float wt = info.GetWeight(ridx); @@ -134,7 +134,7 @@ struct EvalRank : public MetricNoCache, public EvalRankConfig { std::vector sum_tloc(ctx_->Threads(), 0.0); { - const auto& labels = info.labels.View(Context::kCpuId); + const auto& labels = info.labels.HostView(); const auto &h_preds = preds.ConstHostVector(); dmlc::OMPException exc; diff --git a/src/metric/rank_metric.cu b/src/metric/rank_metric.cu index 9ba1baf8f..f79d52742 100644 --- a/src/metric/rank_metric.cu +++ b/src/metric/rank_metric.cu @@ -33,7 +33,7 @@ PackedReduceResult PreScore(Context const *ctx, MetaInfo const &info, HostDeviceVector const &predt, std::shared_ptr p_cache) { auto d_gptr = p_cache->DataGroupPtr(ctx); - auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0); + auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0); predt.SetDevice(ctx->gpu_id); auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan()); @@ -89,7 +89,7 @@ PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info, if (!d_weight.Empty()) { CHECK_EQ(d_weight.weights.size(), p_cache->Groups()); } - auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0); + auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0); predt.SetDevice(ctx->gpu_id); auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), predt.Size()); @@ -119,9 +119,9 @@ PackedReduceResult MAPScore(Context const *ctx, MetaInfo const &info, HostDeviceVector const &predt, bool minus, std::shared_ptr p_cache) { auto d_group_ptr = p_cache->DataGroupPtr(ctx); - auto d_label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0); + auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0); - predt.SetDevice(ctx->gpu_id); + predt.SetDevice(ctx->Device()); auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan()); auto key_it = dh::MakeTransformIterator( thrust::make_counting_iterator(0ul), diff --git a/src/objective/adaptive.cu 
b/src/objective/adaptive.cu index bba8b85ad..29f70a8d8 100644 --- a/src/objective/adaptive.cu +++ b/src/objective/adaptive.cu @@ -19,7 +19,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span pos dh::device_vector* p_ridx, HostDeviceVector* p_nptr, HostDeviceVector* p_nidx, RegTree const& tree) { // copy position to buffer - dh::safe_cuda(cudaSetDevice(ctx->gpu_id)); + dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); auto cuctx = ctx->CUDACtx(); size_t n_samples = position.size(); dh::device_vector sorted_position(position.size()); @@ -86,11 +86,11 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span pos */ auto& nidx = *p_nidx; auto& nptr = *p_nptr; - nidx.SetDevice(ctx->gpu_id); + nidx.SetDevice(ctx->Device()); nidx.Resize(n_leaf); auto d_node_idx = nidx.DeviceSpan(); - nptr.SetDevice(ctx->gpu_id); + nptr.SetDevice(ctx->Device()); nptr.Resize(n_leaf + 1, 0); auto d_node_ptr = nptr.DeviceSpan(); @@ -142,7 +142,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span pos void UpdateTreeLeafDevice(Context const* ctx, common::Span position, std::int32_t group_idx, MetaInfo const& info, float learning_rate, HostDeviceVector const& predt, float alpha, RegTree* p_tree) { - dh::safe_cuda(cudaSetDevice(ctx->gpu_id)); + dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); dh::device_vector ridx; HostDeviceVector nptr; HostDeviceVector nidx; @@ -155,13 +155,13 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span pos } HostDeviceVector quantiles; - predt.SetDevice(ctx->gpu_id); + predt.SetDevice(ctx->Device()); auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), info.num_row_, predt.Size() / info.num_row_); CHECK_LT(group_idx, d_predt.Shape(1)); auto t_predt = d_predt.Slice(linalg::All(), group_idx); - auto d_labels = info.labels.View(ctx->gpu_id).Slice(linalg::All(), IdxY(info, group_idx)); + auto d_labels = info.labels.View(ctx->Device()).Slice(linalg::All(), IdxY(info, group_idx)); auto d_row_index = dh::ToSpan(ridx); auto seg_beg = nptr.DevicePointer(); @@ -178,7 +178,7 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span pos if (info.weights_.Empty()) { common::SegmentedQuantile(ctx, alpha, seg_beg, seg_end, val_beg, val_end, &quantiles); } else { - info.weights_.SetDevice(ctx->gpu_id); + info.weights_.SetDevice(ctx->Device()); auto d_weights = info.weights_.ConstDeviceSpan(); CHECK_EQ(d_weights.size(), d_row_index.size()); auto w_it = thrust::make_permutation_iterator(dh::tcbegin(d_weights), dh::tcbegin(d_row_index)); diff --git a/src/objective/lambdarank_obj.cc b/src/objective/lambdarank_obj.cc index 46fd77705..5a3a38fdf 100644 --- a/src/objective/lambdarank_obj.cc +++ b/src/objective/lambdarank_obj.cc @@ -109,12 +109,12 @@ class LambdaRankObj : public FitIntercept { lj_.SetDevice(ctx_->gpu_id); if (ctx_->IsCPU()) { - cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->gpu_id), - lj_full_.View(ctx_->gpu_id), &ti_plus_, &tj_minus_, + cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()), + lj_full_.View(ctx_->Device()), &ti_plus_, &tj_minus_, &li_, &lj_, p_cache_); } else { - cuda_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->gpu_id), - lj_full_.View(ctx_->gpu_id), &ti_plus_, &tj_minus_, + cuda_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()), + lj_full_.View(ctx_->Device()), &ti_plus_, &tj_minus_, &li_, &lj_, p_cache_); } @@ -354,9 +354,9 @@ class LambdaRankNDCG : public LambdaRankObj { const MetaInfo& info, linalg::Matrix* out_gpair) { if (ctx_->IsCUDA()) { 
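      // (Illustrative note, not part of the patch: Tensor::View(DeviceOrd) from the
      // linalg change above calls SetDevice() on the backing HostDeviceVector before
      // taking a device span, so these call sites need no separate SetDevice().)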
cuda_impl::LambdaRankGetGradientNDCG( - ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->gpu_id), - tj_minus_.View(ctx_->gpu_id), li_full_.View(ctx_->gpu_id), lj_full_.View(ctx_->gpu_id), - out_gpair); + ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->Device()), + tj_minus_.View(ctx_->Device()), li_full_.View(ctx_->Device()), + lj_full_.View(ctx_->Device()), out_gpair); return; } @@ -477,9 +477,9 @@ class LambdaRankMAP : public LambdaRankObj { CHECK(param_.ndcg_exp_gain) << "NDCG gain can not be set for the MAP objective."; if (ctx_->IsCUDA()) { return cuda_impl::LambdaRankGetGradientMAP( - ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->gpu_id), - tj_minus_.View(ctx_->gpu_id), li_full_.View(ctx_->gpu_id), lj_full_.View(ctx_->gpu_id), - out_gpair); + ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->Device()), + tj_minus_.View(ctx_->Device()), li_full_.View(ctx_->Device()), + lj_full_.View(ctx_->Device()), out_gpair); } auto gptr = p_cache_->DataGroupPtr(ctx_).data(); @@ -567,9 +567,9 @@ class LambdaRankPairwise : public LambdaRankObjIsCUDA()) { return cuda_impl::LambdaRankGetGradientPairwise( - ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->gpu_id), - tj_minus_.View(ctx_->gpu_id), li_full_.View(ctx_->gpu_id), lj_full_.View(ctx_->gpu_id), - out_gpair); + ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->Device()), + tj_minus_.View(ctx_->Device()), li_full_.View(ctx_->Device()), + lj_full_.View(ctx_->Device()), out_gpair); } auto gptr = p_cache_->DataGroupPtr(ctx_); diff --git a/src/objective/lambdarank_obj.cu b/src/objective/lambdarank_obj.cu index 0f57fce48..ac31a2c79 100644 --- a/src/objective/lambdarank_obj.cu +++ b/src/objective/lambdarank_obj.cu @@ -306,7 +306,7 @@ void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector const CHECK_NE(d_rounding.Size(), 0); - auto label = info.labels.View(ctx->gpu_id); + auto label = info.labels.View(ctx->Device()); auto predts = preds.ConstDeviceSpan(); auto gpairs = out_gpair->View(ctx->Device()); thrust::fill_n(ctx->CUDACtx()->CTP(), gpairs.Values().data(), gpairs.Size(), @@ -348,7 +348,7 @@ common::Span SortY(Context const* ctx, MetaInfo const& info, common::Span d_rank, std::shared_ptr p_cache) { auto const d_group_ptr = p_cache->DataGroupPtr(ctx); - auto label = info.labels.View(ctx->gpu_id); + auto label = info.labels.View(ctx->Device()); // The buffer for ranked y is necessary as cub segmented sort accepts only pointer. 
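The hunks above and below all apply one substitution: a raw integer device id (ctx->gpu_id, Context::kCpuId, or a literal 0/-1) becomes a typed DeviceOrd value. A minimal sketch of that interface as it can be inferred from the call sites in this patch; the header path and exact signatures are assumptions, not quotes from the source tree:

    #include <cassert>

    #include "xgboost/context.h"  // assumed header; DeviceOrd's real location isn't shown in this patch

    int main() {
      auto cpu = xgboost::DeviceOrd::CPU();    // typed replacement for Context::kCpuId (-1)
      auto gpu = xgboost::DeviceOrd::CUDA(0);  // typed replacement for a raw CUDA ordinal
      assert(cpu.IsCPU());
      assert(gpu.IsCUDA());
      assert(gpu.ordinal == 0);  // the raw ordinal stays reachable for CUDA API calls
      return 0;
    }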
auto d_y_ranked = p_cache->RankedY(ctx, info.num_row_); thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul), d_y_ranked.size(), @@ -374,13 +374,13 @@ void LambdaRankGetGradientNDCG(Context const* ctx, std::int32_t iter, linalg::VectorView li, linalg::VectorView lj, linalg::Matrix* out_gpair) { // boilerplate - std::int32_t device_id = ctx->gpu_id; - dh::safe_cuda(cudaSetDevice(device_id)); + auto device = ctx->Device(); + dh::safe_cuda(cudaSetDevice(device.ordinal)); auto const d_inv_IDCG = p_cache->InvIDCG(ctx); auto const discount = p_cache->Discount(ctx); - info.labels.SetDevice(device_id); - preds.SetDevice(device_id); + info.labels.SetDevice(device); + preds.SetDevice(device); auto const exp_gain = p_cache->Param().ndcg_exp_gain; auto delta_ndcg = [=] XGBOOST_DEVICE(float y_high, float y_low, std::size_t rank_high, @@ -403,7 +403,7 @@ void MAPStat(Context const* ctx, MetaInfo const& info, common::Span( thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) -> std::size_t { return dh::SegmentId(group_ptr, i); }); - auto label = info.labels.View(ctx->gpu_id).Slice(linalg::All(), 0); + auto label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0); auto const* cuctx = ctx->CUDACtx(); { @@ -442,11 +442,11 @@ void LambdaRankGetGradientMAP(Context const* ctx, std::int32_t iter, linalg::VectorView tj_minus, // input bias ratio linalg::VectorView li, linalg::VectorView lj, linalg::Matrix* out_gpair) { - std::int32_t device_id = ctx->gpu_id; - dh::safe_cuda(cudaSetDevice(device_id)); + auto device = ctx->Device(); + dh::safe_cuda(cudaSetDevice(device.ordinal)); - info.labels.SetDevice(device_id); - predt.SetDevice(device_id); + info.labels.SetDevice(device); + predt.SetDevice(device); CHECK(p_cache); @@ -481,11 +481,11 @@ void LambdaRankGetGradientPairwise(Context const* ctx, std::int32_t iter, linalg::VectorView tj_minus, // input bias ratio linalg::VectorView li, linalg::VectorView lj, linalg::Matrix* out_gpair) { - std::int32_t device_id = ctx->gpu_id; - dh::safe_cuda(cudaSetDevice(device_id)); + auto device = ctx->Device(); + dh::safe_cuda(cudaSetDevice(device.ordinal)); - info.labels.SetDevice(device_id); - predt.SetDevice(device_id); + info.labels.SetDevice(device); + predt.SetDevice(device); auto d_predt = predt.ConstDeviceSpan(); auto const d_sorted_idx = p_cache->SortedIdx(ctx, d_predt); @@ -517,11 +517,11 @@ void LambdaRankUpdatePositionBias(Context const* ctx, linalg::VectorViewDataGroupPtr(ctx); auto n_groups = d_group_ptr.size() - 1; - auto ti_plus = p_ti_plus->View(ctx->gpu_id); - auto tj_minus = p_tj_minus->View(ctx->gpu_id); + auto ti_plus = p_ti_plus->View(ctx->Device()); + auto tj_minus = p_tj_minus->View(ctx->Device()); - auto li = p_li->View(ctx->gpu_id); - auto lj = p_lj->View(ctx->gpu_id); + auto li = p_li->View(ctx->Device()); + auto lj = p_lj->View(ctx->Device()); CHECK_EQ(li.Size(), ti_plus.Size()); auto const& param = p_cache->Param(); diff --git a/src/objective/quantile_obj.cu b/src/objective/quantile_obj.cu index 0774223e7..8d83b829b 100644 --- a/src/objective/quantile_obj.cu +++ b/src/objective/quantile_obj.cu @@ -62,7 +62,7 @@ class QuantileRegression : public ObjFunction { CHECK_GE(n_targets, n_alphas); CHECK_EQ(preds.Size(), info.num_row_ * n_targets); - auto labels = info.labels.View(ctx_->gpu_id); + auto labels = info.labels.View(ctx_->Device()); out_gpair->SetDevice(ctx_->Device()); CHECK_EQ(info.labels.Shape(1), 1) @@ -131,7 +131,7 @@ class QuantileRegression : public ObjFunction { #if defined(XGBOOST_USE_CUDA) 
alpha_.SetDevice(ctx_->gpu_id); auto d_alpha = alpha_.ConstDeviceSpan(); - auto d_labels = info.labels.View(ctx_->gpu_id); + auto d_labels = info.labels.View(ctx_->Device()); auto seg_it = dh::MakeTransformIterator( thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) { return i * d_labels.Shape(0); }); diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu index 5751d6102..4f099a537 100644 --- a/src/objective/regression_obj.cu +++ b/src/objective/regression_obj.cu @@ -69,7 +69,7 @@ class RegLossObj : public FitIntercept { public: void ValidateLabel(MetaInfo const& info) { - auto label = info.labels.View(ctx_->Ordinal()); + auto label = info.labels.View(ctx_->Device()); auto valid = ctx_->DispatchDevice( [&] { return std::all_of(linalg::cbegin(label), linalg::cend(label), @@ -244,7 +244,7 @@ class PseudoHuberRegression : public FitIntercept { CheckRegInputs(info, preds); auto slope = param_.huber_slope; CHECK_NE(slope, 0.0) << "slope for pseudo huber cannot be 0."; - auto labels = info.labels.View(ctx_->gpu_id); + auto labels = info.labels.View(ctx_->Device()); out_gpair->SetDevice(ctx_->gpu_id); out_gpair->Reshape(info.num_row_, this->Targets(info)); @@ -698,7 +698,7 @@ class MeanAbsoluteError : public ObjFunction { void GetGradient(HostDeviceVector const& preds, const MetaInfo& info, std::int32_t /*iter*/, linalg::Matrix* out_gpair) override { CheckRegInputs(info, preds); - auto labels = info.labels.View(ctx_->gpu_id); + auto labels = info.labels.View(ctx_->Device()); out_gpair->SetDevice(ctx_->Device()); out_gpair->Reshape(info.num_row_, this->Targets(info)); diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc index c092c0b04..26d8f3440 100644 --- a/src/predictor/cpu_predictor.cc +++ b/src/predictor/cpu_predictor.cc @@ -663,7 +663,7 @@ class CPUPredictor : public Predictor { std::size_t n_samples = p_fmat->Info().num_row_; std::size_t n_groups = model.learner_model_param->OutputLength(); CHECK_EQ(out_preds->size(), n_samples * n_groups); - linalg::TensorView out_predt{*out_preds, {n_samples, n_groups}, ctx_->gpu_id}; + auto out_predt = linalg::MakeTensorView(ctx_, *out_preds, n_samples, n_groups); if (!p_fmat->PageExists()) { std::vector workspace(p_fmat->Info().num_col_ * kUnroll * n_threads); @@ -732,7 +732,7 @@ class CPUPredictor : public Predictor { std::vector thread_temp; InitThreadTemp(n_threads * kBlockSize, &thread_temp); std::size_t n_groups = model.learner_model_param->OutputLength(); - linalg::TensorView out_predt{predictions, {m->NumRows(), n_groups}, Context::kCpuId}; + auto out_predt = linalg::MakeTensorView(ctx_, predictions, m->NumRows(), n_groups); PredictBatchByBlockOfRowsKernel, kBlockSize>( AdapterView(m.get(), missing, common::Span{workspace}, n_threads), model, tree_begin, tree_end, &thread_temp, n_threads, out_predt); @@ -878,8 +878,8 @@ class CPUPredictor : public Predictor { common::ParallelFor(ntree_limit, n_threads, [&](bst_omp_uint i) { FillNodeMeanValues(model.trees[i].get(), &(mean_values[i])); }); - auto base_margin = info.base_margin_.View(Context::kCpuId); - auto base_score = model.learner_model_param->BaseScore(Context::kCpuId)(0); + auto base_margin = info.base_margin_.View(ctx_->Device()); + auto base_score = model.learner_model_param->BaseScore(ctx_->Device())(0); // start collecting the contributions for (const auto &batch : p_fmat->GetBatches()) { auto page = batch.GetView(); diff --git a/src/predictor/predictor.cc b/src/predictor/predictor.cc index 2559447f3..4d7fc598f 100644 --- 
a/src/predictor/predictor.cc +++ b/src/predictor/predictor.cc @@ -60,7 +60,7 @@ void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVectorResize(n); - auto base_score = model.learner_model_param->BaseScore(Context::kCpuId)(0); + auto base_score = model.learner_model_param->BaseScore(DeviceOrd::CPU())(0); out_preds->Fill(base_score); } } diff --git a/src/tree/fit_stump.cc b/src/tree/fit_stump.cc index ec654a1b2..ec1b6fe18 100644 --- a/src/tree/fit_stump.cc +++ b/src/tree/fit_stump.cc @@ -74,7 +74,7 @@ void FitStump(Context const* ctx, MetaInfo const& info, linalg::MatrixDevice()); auto gpair_t = gpair.View(ctx->Device()); ctx->IsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView()) - : cuda_impl::FitStump(ctx, gpair_t, out->View(ctx->gpu_id)); + : cuda_impl::FitStump(ctx, gpair_t, out->View(ctx->Device())); } } // namespace tree } // namespace xgboost diff --git a/src/tree/fit_stump.cu b/src/tree/fit_stump.cu index 33f92014e..40b2a0c96 100644 --- a/src/tree/fit_stump.cu +++ b/src/tree/fit_stump.cu @@ -1,5 +1,5 @@ /** - * Copyright 2022 by XGBoost Contributors + * Copyright 2022-2023 by XGBoost Contributors * * \brief Utilities for estimating initial score. */ @@ -41,7 +41,7 @@ void FitStump(Context const* ctx, linalg::TensorView gpai auto sample = i % gpair.Shape(0); return GradientPairPrecise{gpair(sample, target)}; }); - auto d_sum = sum.View(ctx->gpu_id); + auto d_sum = sum.View(ctx->Device()); CHECK(d_sum.CContiguous()); dh::XGBCachingDeviceAllocator alloc; diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index 82dc99b12..d0267b0ed 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -774,7 +774,7 @@ void UpdatePredictionCacheImpl(Context const *ctx, RegTree const *p_last_tree, std::vector const &partitioner, linalg::VectorView out_preds) { auto const &tree = *p_last_tree; - CHECK_EQ(out_preds.DeviceIdx(), Context::kCpuId); + CHECK(out_preds.Device().IsCPU()); size_t n_nodes = p_last_tree->GetNodes().size(); for (auto &part : partitioner) { CHECK_EQ(part.Size(), n_nodes); @@ -809,7 +809,7 @@ void UpdatePredictionCacheImpl(Context const *ctx, RegTree const *p_last_tree, auto n_nodes = mttree->Size(); auto n_targets = tree.NumTargets(); CHECK_EQ(out_preds.Shape(1), n_targets); - CHECK_EQ(out_preds.DeviceIdx(), Context::kCpuId); + CHECK(out_preds.Device().IsCPU()); for (auto &part : partitioner) { CHECK_EQ(part.Size(), n_nodes); diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 10fb913b3..0e42f1562 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -516,9 +516,10 @@ struct GPUHistMakerDevice { } CHECK(p_tree); - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); - CHECK_EQ(out_preds_d.DeviceIdx(), ctx_->gpu_id); + CHECK(out_preds_d.Device().IsCUDA()); + CHECK_EQ(out_preds_d.Device().ordinal, ctx_->Ordinal()); + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); auto d_position = dh::ToSpan(positions); CHECK_EQ(out_preds_d.Size(), d_position.size()); diff --git a/tests/cpp/common/test_linalg.cc b/tests/cpp/common/test_linalg.cc index b1a90d773..f345b3a78 100644 --- a/tests/cpp/common/test_linalg.cc +++ b/tests/cpp/common/test_linalg.cc @@ -3,7 +3,7 @@ */ #include #include -#include +#include // for HostDeviceVector #include #include // size_t @@ -14,8 +14,8 @@ namespace xgboost::linalg { namespace { -auto kCpuId = Context::kCpuId; -} +DeviceOrd CPU() { return DeviceOrd::CPU(); } +} // namespace auto MakeMatrixFromTest(HostDeviceVector *storage, std::size_t 
n_rows, std::size_t n_cols) { storage->Resize(n_rows * n_cols); @@ -23,7 +23,7 @@ auto MakeMatrixFromTest(HostDeviceVector *storage, std::size_t n_rows, st std::iota(h_storage.begin(), h_storage.end(), 0); - auto m = linalg::TensorView{h_storage, {n_rows, static_cast(n_cols)}, -1}; + auto m = linalg::TensorView{h_storage, {n_rows, static_cast(n_cols)}, CPU()}; return m; } @@ -31,7 +31,7 @@ TEST(Linalg, MatrixView) { size_t kRows = 31, kCols = 77; HostDeviceVector storage; auto m = MakeMatrixFromTest(&storage, kRows, kCols); - ASSERT_EQ(m.DeviceIdx(), kCpuId); + ASSERT_EQ(m.Device(), CPU()); ASSERT_EQ(m(0, 0), 0); ASSERT_EQ(m(kRows - 1, kCols - 1), storage.Size() - 1); } @@ -76,7 +76,7 @@ TEST(Linalg, TensorView) { { // as vector - TensorView vec{data, {data.size()}, -1}; + TensorView vec{data, {data.size()}, CPU()}; ASSERT_EQ(vec.Size(), data.size()); ASSERT_EQ(vec.Shape(0), data.size()); ASSERT_EQ(vec.Shape().size(), 1); @@ -87,7 +87,7 @@ TEST(Linalg, TensorView) { { // as matrix - TensorView mat(data, {6, 4}, -1); + TensorView mat(data, {6, 4}, CPU()); auto s = mat.Slice(2, All()); ASSERT_EQ(s.Shape().size(), 1); s = mat.Slice(All(), 1); @@ -96,7 +96,7 @@ TEST(Linalg, TensorView) { { // assignment - TensorView t{data, {2, 3, 4}, 0}; + TensorView t{data, {2, 3, 4}, CPU()}; double pi = 3.14159; auto old = t(1, 2, 3); t(1, 2, 3) = pi; @@ -201,7 +201,7 @@ TEST(Linalg, TensorView) { } { // f-contiguous - TensorView t{data, {4, 3, 2}, {1, 4, 12}, kCpuId}; + TensorView t{data, {4, 3, 2}, {1, 4, 12}, CPU()}; ASSERT_TRUE(t.Contiguous()); ASSERT_TRUE(t.FContiguous()); ASSERT_FALSE(t.CContiguous()); @@ -210,11 +210,11 @@ TEST(Linalg, TensorView) { TEST(Linalg, Tensor) { { - Tensor t{{2, 3, 4}, kCpuId, Order::kC}; - auto view = t.View(kCpuId); + Tensor t{{2, 3, 4}, CPU(), Order::kC}; + auto view = t.View(CPU()); auto const &as_const = t; - auto k_view = as_const.View(kCpuId); + auto k_view = as_const.View(CPU()); size_t n = 2 * 3 * 4; ASSERT_EQ(t.Size(), n); @@ -229,7 +229,7 @@ TEST(Linalg, Tensor) { } { // Reshape - Tensor t{{2, 3, 4}, kCpuId, Order::kC}; + Tensor t{{2, 3, 4}, CPU(), Order::kC}; t.Reshape(4, 3, 2); ASSERT_EQ(t.Size(), 24); ASSERT_EQ(t.Shape(2), 2); @@ -247,7 +247,7 @@ TEST(Linalg, Tensor) { TEST(Linalg, Empty) { { - auto t = TensorView{{}, {0, 3}, kCpuId, Order::kC}; + auto t = TensorView{{}, {0, 3}, CPU(), Order::kC}; for (int32_t i : {0, 1, 2}) { auto s = t.Slice(All(), i); ASSERT_EQ(s.Size(), 0); @@ -256,9 +256,9 @@ TEST(Linalg, Empty) { } } { - auto t = Tensor{{0, 3}, kCpuId, Order::kC}; + auto t = Tensor{{0, 3}, CPU(), Order::kC}; ASSERT_EQ(t.Size(), 0); - auto view = t.View(kCpuId); + auto view = t.View(CPU()); for (int32_t i : {0, 1, 2}) { auto s = view.Slice(All(), i); @@ -270,7 +270,7 @@ TEST(Linalg, Empty) { } TEST(Linalg, ArrayInterface) { - auto cpu = kCpuId; + auto cpu = CPU(); auto t = Tensor{{3, 3}, cpu, Order::kC}; auto v = t.View(cpu); std::iota(v.Values().begin(), v.Values().end(), 0); @@ -315,16 +315,16 @@ TEST(Linalg, Popc) { } TEST(Linalg, Stack) { - Tensor l{{2, 3, 4}, kCpuId, Order::kC}; - ElementWiseTransformHost(l.View(kCpuId), omp_get_max_threads(), + Tensor l{{2, 3, 4}, CPU(), Order::kC}; + ElementWiseTransformHost(l.View(CPU()), omp_get_max_threads(), [=](size_t i, float) { return i; }); - Tensor r_0{{2, 3, 4}, kCpuId, Order::kC}; - ElementWiseTransformHost(r_0.View(kCpuId), omp_get_max_threads(), + Tensor r_0{{2, 3, 4}, CPU(), Order::kC}; + ElementWiseTransformHost(r_0.View(CPU()), omp_get_max_threads(), [=](size_t i, float) { return i; }); Stack(&l, 
r_0); - Tensor r_1{{0, 3, 4}, kCpuId, Order::kC}; + Tensor r_1{{0, 3, 4}, CPU(), Order::kC}; Stack(&l, r_1); ASSERT_EQ(l.Shape(0), 4); @@ -335,7 +335,7 @@ TEST(Linalg, Stack) { TEST(Linalg, FOrder) { std::size_t constexpr kRows = 16, kCols = 3; std::vector data(kRows * kCols); - MatrixView mat{data, {kRows, kCols}, Context::kCpuId, Order::kF}; + MatrixView mat{data, {kRows, kCols}, CPU(), Order::kF}; float k{0}; for (std::size_t i = 0; i < kRows; ++i) { for (std::size_t j = 0; j < kCols; ++j) { diff --git a/tests/cpp/common/test_linalg.cu b/tests/cpp/common/test_linalg.cu index be89d51bc..b88b8e127 100644 --- a/tests/cpp/common/test_linalg.cu +++ b/tests/cpp/common/test_linalg.cu @@ -11,17 +11,18 @@ namespace xgboost::linalg { namespace { void TestElementWiseKernel() { + auto device = DeviceOrd::CUDA(0); Tensor l{{2, 3, 4}, 0}; { /** * Non-contiguous */ // GPU view - auto t = l.View(0).Slice(linalg::All(), 1, linalg::All()); + auto t = l.View(device).Slice(linalg::All(), 1, linalg::All()); ASSERT_FALSE(t.CContiguous()); ElementWiseTransformDevice(t, [] __device__(size_t i, float) { return i; }); // CPU view - t = l.View(Context::kCpuId).Slice(linalg::All(), 1, linalg::All()); + t = l.View(DeviceOrd::CPU()).Slice(linalg::All(), 1, linalg::All()); size_t k = 0; for (size_t i = 0; i < l.Shape(0); ++i) { for (size_t j = 0; j < l.Shape(2); ++j) { @@ -29,7 +30,7 @@ void TestElementWiseKernel() { } } - t = l.View(0).Slice(linalg::All(), 1, linalg::All()); + t = l.View(device).Slice(linalg::All(), 1, linalg::All()); ElementWiseKernelDevice(t, [] XGBOOST_DEVICE(size_t i, float v) { SPAN_CHECK(v == i); }); } @@ -37,11 +38,11 @@ void TestElementWiseKernel() { /** * Contiguous */ - auto t = l.View(0); + auto t = l.View(device); ElementWiseTransformDevice(t, [] XGBOOST_DEVICE(size_t i, float) { return i; }); ASSERT_TRUE(t.CContiguous()); // CPU view - t = l.View(Context::kCpuId); + t = l.View(DeviceOrd::CPU()); size_t ind = 0; for (size_t i = 0; i < l.Shape(0); ++i) { diff --git a/tests/cpp/common/test_ranking_utils.cu b/tests/cpp/common/test_ranking_utils.cu index d62f5f171..86ce4b6d0 100644 --- a/tests/cpp/common/test_ranking_utils.cu +++ b/tests/cpp/common/test_ranking_utils.cu @@ -41,7 +41,7 @@ void TestCalcQueriesInvIDCG() { p.UpdateAllowUnknown(Args{{"ndcg_exp_gain", "false"}}); cuda_impl::CalcQueriesInvIDCG(&ctx, linalg::MakeTensorView(&ctx, d_scores, d_scores.size()), - dh::ToSpan(group_ptr), inv_IDCG.View(ctx.gpu_id), p); + dh::ToSpan(group_ptr), inv_IDCG.View(ctx.Device()), p); for (std::size_t i = 0; i < n_groups; ++i) { double inv_idcg = inv_IDCG(i); ASSERT_NEAR(inv_idcg, 0.00551782, kRtEps); diff --git a/tests/cpp/common/test_stats.cu b/tests/cpp/common/test_stats.cu index 08877ac8d..3dc90e069 100644 --- a/tests/cpp/common/test_stats.cu +++ b/tests/cpp/common/test_stats.cu @@ -47,7 +47,7 @@ class StatsGPU : public ::testing::Test { data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end()); linalg::Tensor arr{data.cbegin(), data.cend(), {data.size()}, 0}; - auto d_arr = arr.View(0); + auto d_arr = arr.View(DeviceOrd::CUDA(0)); auto key_it = dh::MakeTransformIterator( thrust::make_counting_iterator(0ul), @@ -71,8 +71,8 @@ class StatsGPU : public ::testing::Test { } void Weighted() { - auto d_arr = arr_.View(0); - auto d_key = indptr_.View(0); + auto d_arr = arr_.View(DeviceOrd::CUDA(0)); + auto d_key = indptr_.View(DeviceOrd::CUDA(0)); auto key_it = dh::MakeTransformIterator( thrust::make_counting_iterator(0ul), @@ -81,7 +81,7 @@ class StatsGPU : 
public ::testing::Test { dh::MakeTransformIterator(thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) { return d_arr(i); }); linalg::Tensor weights{{10}, 0}; - linalg::ElementWiseTransformDevice(weights.View(0), + linalg::ElementWiseTransformDevice(weights.View(DeviceOrd::CUDA(0)), [=] XGBOOST_DEVICE(std::size_t, float) { return 1.0; }); auto w_it = weights.Data()->ConstDevicePointer(); for (auto const& pair : TestSet{{0.0f, 1.0f}, {0.5f, 3.0f}, {1.0f, 5.0f}}) { @@ -102,7 +102,7 @@ class StatsGPU : public ::testing::Test { data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end()); linalg::Tensor arr{data.cbegin(), data.cend(), {data.size()}, 0}; - auto d_arr = arr.View(0); + auto d_arr = arr.View(DeviceOrd::CUDA(0)); auto key_it = dh::MakeTransformIterator( thrust::make_counting_iterator(0ul), @@ -125,8 +125,8 @@ class StatsGPU : public ::testing::Test { } void NonWeighted() { - auto d_arr = arr_.View(0); - auto d_key = indptr_.View(0); + auto d_arr = arr_.View(DeviceOrd::CUDA(0)); + auto d_key = indptr_.View(DeviceOrd::CUDA(0)); auto key_it = dh::MakeTransformIterator( thrust::make_counting_iterator(0ul), [=] __device__(std::size_t i) { return d_key(i); }); diff --git a/tests/cpp/data/test_array_interface.cc b/tests/cpp/data/test_array_interface.cc index 7e0484842..b692a2aa5 100644 --- a/tests/cpp/data/test_array_interface.cc +++ b/tests/cpp/data/test_array_interface.cc @@ -22,7 +22,7 @@ TEST(ArrayInterface, Initialize) { HostDeviceVector u64_storage(storage.Size()); std::string u64_arr_str{ArrayInterfaceStr(linalg::TensorView{ - u64_storage.ConstHostSpan(), {kRows, kCols}, Context::kCpuId})}; + u64_storage.ConstHostSpan(), {kRows, kCols}, DeviceOrd::CPU()})}; std::copy(storage.ConstHostVector().cbegin(), storage.ConstHostVector().cend(), u64_storage.HostSpan().begin()); auto u64_arr = ArrayInterface<2>{u64_arr_str}; diff --git a/tests/cpp/data/test_metainfo.cc b/tests/cpp/data/test_metainfo.cc index 5ebe1c6bd..dbaffb7cd 100644 --- a/tests/cpp/data/test_metainfo.cc +++ b/tests/cpp/data/test_metainfo.cc @@ -129,8 +129,8 @@ TEST(MetaInfo, SaveLoadBinary) { EXPECT_EQ(inforead.group_ptr_, info.group_ptr_); EXPECT_EQ(inforead.weights_.HostVector(), info.weights_.HostVector()); - auto orig_margin = info.base_margin_.View(xgboost::Context::kCpuId); - auto read_margin = inforead.base_margin_.View(xgboost::Context::kCpuId); + auto orig_margin = info.base_margin_.View(xgboost::DeviceOrd::CPU()); + auto read_margin = inforead.base_margin_.View(xgboost::DeviceOrd::CPU()); EXPECT_TRUE(std::equal(orig_margin.Values().cbegin(), orig_margin.Values().cend(), read_margin.Values().cbegin())); @@ -267,8 +267,8 @@ TEST(MetaInfo, Validate) { xgboost::HostDeviceVector d_groups{groups}; d_groups.SetDevice(0); d_groups.DevicePointer(); // pull to device - std::string arr_interface_str{ArrayInterfaceStr( - xgboost::linalg::MakeVec(d_groups.ConstDevicePointer(), d_groups.Size(), 0))}; + std::string arr_interface_str{ArrayInterfaceStr(xgboost::linalg::MakeVec( + d_groups.ConstDevicePointer(), d_groups.Size(), xgboost::DeviceOrd::CUDA(0)))}; EXPECT_THROW(info.SetInfo(ctx, "group", xgboost::StringView{arr_interface_str}), dmlc::Error); #endif // defined(XGBOOST_USE_CUDA) } @@ -307,5 +307,5 @@ TEST(MetaInfo, HostExtend) { } namespace xgboost { -TEST(MetaInfo, CPUStridedData) { TestMetaInfoStridedData(Context::kCpuId); } +TEST(MetaInfo, CPUStridedData) { TestMetaInfoStridedData(DeviceOrd::CPU()); } } // namespace xgboost diff --git 
a/tests/cpp/data/test_metainfo.cu b/tests/cpp/data/test_metainfo.cu index 95c8f5f39..4f02dfddc 100644 --- a/tests/cpp/data/test_metainfo.cu +++ b/tests/cpp/data/test_metainfo.cu @@ -65,7 +65,7 @@ TEST(MetaInfo, FromInterface) { } info.SetInfo(ctx, "base_margin", str.c_str()); - auto const h_base_margin = info.base_margin_.View(Context::kCpuId); + auto const h_base_margin = info.base_margin_.View(DeviceOrd::CPU()); ASSERT_EQ(h_base_margin.Size(), d_data.size()); for (size_t i = 0; i < d_data.size(); ++i) { ASSERT_EQ(h_base_margin(i), d_data[i]); @@ -83,7 +83,7 @@ TEST(MetaInfo, FromInterface) { } TEST(MetaInfo, GPUStridedData) { - TestMetaInfoStridedData(0); + TestMetaInfoStridedData(DeviceOrd::CUDA(0)); } TEST(MetaInfo, Group) { diff --git a/tests/cpp/data/test_metainfo.h b/tests/cpp/data/test_metainfo.h index 6e45b5062..fba882e0e 100644 --- a/tests/cpp/data/test_metainfo.h +++ b/tests/cpp/data/test_metainfo.h @@ -14,10 +14,10 @@ #include "../../../src/data/array_interface.h" namespace xgboost { -inline void TestMetaInfoStridedData(int32_t device) { +inline void TestMetaInfoStridedData(DeviceOrd device) { MetaInfo info; Context ctx; - ctx.UpdateAllowUnknown(Args{{"gpu_id", std::to_string(device)}}); + ctx.UpdateAllowUnknown(Args{{"device", device.Name()}}); { // labels linalg::Tensor labels; @@ -28,9 +28,9 @@ inline void TestMetaInfoStridedData(int32_t device) { ASSERT_EQ(t_labels.Shape().size(), 2); info.SetInfo(ctx, "label", StringView{ArrayInterfaceStr(t_labels)}); - auto const& h_result = info.labels.View(-1); + auto const& h_result = info.labels.View(DeviceOrd::CPU()); ASSERT_EQ(h_result.Shape().size(), 2); - auto in_labels = labels.View(-1); + auto in_labels = labels.View(DeviceOrd::CPU()); linalg::ElementWiseKernelHost(h_result, omp_get_max_threads(), [&](size_t i, float& v_0) { auto tup = linalg::UnravelIndex(i, h_result.Shape()); auto i0 = std::get<0>(tup); @@ -62,9 +62,9 @@ inline void TestMetaInfoStridedData(int32_t device) { ASSERT_EQ(t_margin.Shape().size(), 2); info.SetInfo(ctx, "base_margin", StringView{ArrayInterfaceStr(t_margin)}); - auto const& h_result = info.base_margin_.View(-1); + auto const& h_result = info.base_margin_.View(DeviceOrd::CPU()); ASSERT_EQ(h_result.Shape().size(), 2); - auto in_margin = base_margin.View(-1); + auto in_margin = base_margin.View(DeviceOrd::CPU()); linalg::ElementWiseKernelHost(h_result, omp_get_max_threads(), [&](size_t i, float v_0) { auto tup = linalg::UnravelIndex(i, h_result.Shape()); auto i0 = std::get<0>(tup); diff --git a/tests/cpp/data/test_simple_dmatrix.cc b/tests/cpp/data/test_simple_dmatrix.cc index 43d0877d3..f1d588196 100644 --- a/tests/cpp/data/test_simple_dmatrix.cc +++ b/tests/cpp/data/test_simple_dmatrix.cc @@ -298,8 +298,8 @@ TEST(SimpleDMatrix, Slice) { ASSERT_EQ(p_m->Info().weights_.HostVector().at(ridx), out->Info().weights_.HostVector().at(i)); - auto out_margin = out->Info().base_margin_.View(Context::kCpuId); - auto in_margin = margin.View(Context::kCpuId); + auto out_margin = out->Info().base_margin_.View(DeviceOrd::CPU()); + auto in_margin = margin.View(DeviceOrd::CPU()); for (size_t j = 0; j < kClasses; ++j) { ASSERT_EQ(out_margin(i, j), in_margin(ridx, j)); } @@ -372,8 +372,8 @@ TEST(SimpleDMatrix, SliceCol) { out->Info().labels_upper_bound_.HostVector().at(i)); ASSERT_EQ(p_m->Info().weights_.HostVector().at(i), out->Info().weights_.HostVector().at(i)); - auto out_margin = out->Info().base_margin_.View(Context::kCpuId); - auto in_margin = margin.View(Context::kCpuId); + auto out_margin = 
out->Info().base_margin_.View(DeviceOrd::CPU()); + auto in_margin = margin.View(DeviceOrd::CPU()); for (size_t j = 0; j < kClasses; ++j) { ASSERT_EQ(out_margin(i, j), in_margin(i, j)); } diff --git a/tests/cpp/objective/test_lambdarank_obj.cu b/tests/cpp/objective/test_lambdarank_obj.cu index 1c13665fc..c80ec20fc 100644 --- a/tests/cpp/objective/test_lambdarank_obj.cu +++ b/tests/cpp/objective/test_lambdarank_obj.cu @@ -39,9 +39,9 @@ void TestGPUMakePair() { auto make_args = [&](std::shared_ptr p_cache, auto rank_idx, common::Span y_sorted_idx) { linalg::Vector dummy; - auto d = dummy.View(ctx.gpu_id); + auto d = dummy.View(ctx.Device()); linalg::Vector dgpair; - auto dg = dgpair.View(ctx.gpu_id); + auto dg = dgpair.View(ctx.Device()); cuda_impl::KernelInputs args{ d, d, @@ -50,9 +50,9 @@ void TestGPUMakePair() { p_cache->DataGroupPtr(&ctx), p_cache->CUDAThreadsGroupPtr(), rank_idx, - info.labels.View(ctx.gpu_id), + info.labels.View(ctx.Device()), predt.ConstDeviceSpan(), - linalg::MatrixView{common::Span{}, {0}, 0}, + linalg::MatrixView{common::Span{}, {0}, DeviceOrd::CUDA(0)}, dg, nullptr, y_sorted_idx, diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu index 3a65e3e06..f31158482 100644 --- a/tests/cpp/predictor/test_gpu_predictor.cu +++ b/tests/cpp/predictor/test_gpu_predictor.cu @@ -226,7 +226,7 @@ TEST(GPUPredictor, ShapStump) { auto dmat = RandomDataGenerator(3, 1, 0).GenerateDMatrix(); gpu_predictor->PredictContribution(dmat.get(), &predictions, model); auto& phis = predictions.HostVector(); - auto base_score = mparam.BaseScore(Context::kCpuId)(0); + auto base_score = mparam.BaseScore(DeviceOrd::CPU())(0); EXPECT_EQ(phis[0], 0.0); EXPECT_EQ(phis[1], base_score); EXPECT_EQ(phis[2], 0.0); diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc index 993504c57..a9f218c0c 100644 --- a/tests/cpp/predictor/test_predictor.cc +++ b/tests/cpp/predictor/test_predictor.cc @@ -287,7 +287,7 @@ void TestCategoricalPrediction(Context const* ctx, bool is_column_split) { predictor->InitOutPredictions(m->Info(), &out_predictions.predictions, model); predictor->PredictBatch(m.get(), &out_predictions, model, 0); - auto score = mparam.BaseScore(Context::kCpuId)(0); + auto score = mparam.BaseScore(DeviceOrd::CPU())(0); ASSERT_EQ(out_predictions.predictions.Size(), 1ul); ASSERT_EQ(out_predictions.predictions.HostVector()[0], right_weight + score); // go to right for matching cat From 2462e22cd425d8918a4c507e0318d969e4e298bd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 29 Aug 2023 15:10:16 +0800 Subject: [PATCH 128/136] Bump com.nvidia:rapids-4-spark_2.12 in /jvm-packages (#9517) Bumps com.nvidia:rapids-4-spark_2.12 from 23.08.0 to 23.08.1. --- updated-dependencies: - dependency-name: com.nvidia:rapids-4-spark_2.12 dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- jvm-packages/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index ba4bbdf72..5469773c5 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -44,7 +44,7 @@ OFF OFF 23.08.0 - 23.08.0 + 23.08.1 cuda11 3.2.16 2.11.0 From ccfc90e4c6aa3d3328b092b34065f5acdc6bce8a Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 30 Aug 2023 13:00:04 +0800 Subject: [PATCH 129/136] [rabit] Improved connection handling. 
(#9531) - Enable timeout. - Report connection error from the system. - Handle retry for both tracker connection and peer connection. --- .readthedocs.yaml | 1 - include/xgboost/collective/result.h | 160 +++++++++++++++++++ include/xgboost/collective/socket.h | 71 ++++++--- python-package/xgboost/testing/__init__.py | 4 + rabit/include/rabit/internal/socket.h | 30 ++-- rabit/src/allreduce_base.cc | 169 +++++++++++++-------- rabit/src/allreduce_base.h | 12 +- src/collective/socket.cc | 89 +++++++++-- tests/cpp/collective/test_socket.cc | 19 ++- tests/python/test_tracker.py | 38 +++++ 10 files changed, 463 insertions(+), 130 deletions(-) create mode 100644 include/xgboost/collective/result.h diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 80c2b8404..fb7c8dbe6 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -32,4 +32,3 @@ formats: python: install: - requirements: doc/requirements.txt - system_packages: true diff --git a/include/xgboost/collective/result.h b/include/xgboost/collective/result.h new file mode 100644 index 000000000..209362505 --- /dev/null +++ b/include/xgboost/collective/result.h @@ -0,0 +1,160 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#pragma once + +#include // for unique_ptr +#include // for stringstream +#include // for stack +#include // for string +#include // for move + +namespace xgboost::collective { +namespace detail { +struct ResultImpl { + std::string message; + std::error_code errc{}; // optional for system error. + + std::unique_ptr prev{nullptr}; + + ResultImpl() = delete; // must initialize. + ResultImpl(ResultImpl const& that) = delete; + ResultImpl(ResultImpl&& that) = default; + ResultImpl& operator=(ResultImpl const& that) = delete; + ResultImpl& operator=(ResultImpl&& that) = default; + + explicit ResultImpl(std::string msg) : message{std::move(msg)} {} + explicit ResultImpl(std::string msg, std::error_code errc) + : message{std::move(msg)}, errc{std::move(errc)} {} + explicit ResultImpl(std::string msg, std::unique_ptr prev) + : message{std::move(msg)}, prev{std::move(prev)} {} + explicit ResultImpl(std::string msg, std::error_code errc, std::unique_ptr prev) + : message{std::move(msg)}, errc{std::move(errc)}, prev{std::move(prev)} {} + + [[nodiscard]] bool operator==(ResultImpl const& that) const noexcept(true) { + if ((prev && !that.prev) || (!prev && that.prev)) { + // one of them doesn't have prev + return false; + } + + auto cur_eq = message == that.message && errc == that.errc; + if (prev && that.prev) { + // recursive comparison + auto prev_eq = *prev == *that.prev; + return cur_eq && prev_eq; + } + return cur_eq; + } + + [[nodiscard]] std::string Report() { + std::stringstream ss; + ss << "\n- " << this->message; + if (this->errc != std::error_code{}) { + ss << " system error:" << this->errc.message(); + } + + auto ptr = prev.get(); + while (ptr) { + ss << "\n- "; + ss << ptr->message; + + if (ptr->errc != std::error_code{}) { + ss << " " << ptr->errc.message(); + } + ptr = ptr->prev.get(); + } + + return ss.str(); + } + [[nodiscard]] auto Code() const { + // Find the root error. + std::stack stack; + auto ptr = this; + while (ptr) { + stack.push(ptr); + if (ptr->prev) { + ptr = ptr->prev.get(); + } else { + break; + } + } + while (!stack.empty()) { + auto frame = stack.top(); + stack.pop(); + if (frame->errc != std::error_code{}) { + return frame->errc; + } + } + return std::error_code{}; + } +}; +} // namespace detail + +/** + * @brief An error type that's easier to handle than throwing dmlc exception. 
We can + * record and propagate the system error code. + */ +struct Result { + private: + std::unique_ptr impl_{nullptr}; + + public: + Result() noexcept(true) = default; + explicit Result(std::string msg) : impl_{std::make_unique(std::move(msg))} {} + explicit Result(std::string msg, std::error_code errc) + : impl_{std::make_unique(std::move(msg), std::move(errc))} {} + Result(std::string msg, Result&& prev) + : impl_{std::make_unique(std::move(msg), std::move(prev.impl_))} {} + Result(std::string msg, std::error_code errc, Result&& prev) + : impl_{std::make_unique(std::move(msg), std::move(errc), + std::move(prev.impl_))} {} + + Result(Result const& that) = delete; + Result& operator=(Result const& that) = delete; + Result(Result&& that) = default; + Result& operator=(Result&& that) = default; + + [[nodiscard]] bool OK() const noexcept(true) { return !impl_; } + [[nodiscard]] std::string Report() const { return OK() ? "" : impl_->Report(); } + /** + * @brief Return the root system error. This might return success if there's no system error. + */ + [[nodiscard]] auto Code() const { return OK() ? std::error_code{} : impl_->Code(); } + [[nodiscard]] bool operator==(Result const& that) const noexcept(true) { + if (OK() && that.OK()) { + return true; + } + if ((OK() && !that.OK()) || (!OK() && that.OK())) { + return false; + } + return *impl_ == *that.impl_; + } +}; + +/** + * @brief Return success. + */ +[[nodiscard]] inline auto Success() noexcept(true) { return Result{}; } +/** + * @brief Return failure. + */ +[[nodiscard]] inline auto Fail(std::string msg) { return Result{std::move(msg)}; } +/** + * @brief Return failure with `errno`. + */ +[[nodiscard]] inline auto Fail(std::string msg, std::error_code errc) { + return Result{std::move(msg), std::move(errc)}; +} +/** + * @brief Return failure with a previous error. + */ +[[nodiscard]] inline auto Fail(std::string msg, Result&& prev) { + return Result{std::move(msg), std::forward(prev)}; +} +/** + * @brief Return failure with a previous error and a new `errno`. 
+ */ +[[nodiscard]] inline auto Fail(std::string msg, std::error_code errc, Result&& prev) { + return Result{std::move(msg), std::move(errc), std::forward(prev)}; +} +} // namespace xgboost::collective diff --git a/include/xgboost/collective/socket.h b/include/xgboost/collective/socket.h index b5fa7cd70..5bff2204e 100644 --- a/include/xgboost/collective/socket.h +++ b/include/xgboost/collective/socket.h @@ -56,9 +56,10 @@ using ssize_t = int; #endif // defined(_WIN32) -#include "xgboost/base.h" // XGBOOST_EXPECT -#include "xgboost/logging.h" // LOG -#include "xgboost/string_view.h" // StringView +#include "xgboost/base.h" // XGBOOST_EXPECT +#include "xgboost/collective/result.h" // for Result +#include "xgboost/logging.h" // LOG +#include "xgboost/string_view.h" // StringView #if !defined(HOST_NAME_MAX) #define HOST_NAME_MAX 256 // macos @@ -81,6 +82,10 @@ inline std::int32_t LastError() { #endif } +[[nodiscard]] inline collective::Result FailWithCode(std::string msg) { + return collective::Fail(std::move(msg), std::error_code{LastError(), std::system_category()}); +} + #if defined(__GLIBC__) inline auto ThrowAtError(StringView fn_name, std::int32_t errsv = LastError(), std::int32_t line = __builtin_LINE(), @@ -120,15 +125,19 @@ inline std::int32_t CloseSocket(SocketT fd) { #endif } -inline bool LastErrorWouldBlock() { - int errsv = LastError(); +inline bool ErrorWouldBlock(std::int32_t errsv) noexcept(true) { #ifdef _WIN32 return errsv == WSAEWOULDBLOCK; #else - return errsv == EAGAIN || errsv == EWOULDBLOCK; + return errsv == EAGAIN || errsv == EWOULDBLOCK || errsv == EINPROGRESS; #endif // _WIN32 } +inline bool LastErrorWouldBlock() { + int errsv = LastError(); + return ErrorWouldBlock(errsv); +} + inline void SocketStartup() { #if defined(_WIN32) WSADATA wsa_data; @@ -315,23 +324,35 @@ class TCPSocket { bool IsClosed() const { return handle_ == InvalidSocket(); } /** \brief get last error code if any */ - std::int32_t GetSockError() const { - std::int32_t error = 0; - socklen_t len = sizeof(error); - xgboost_CHECK_SYS_CALL( - getsockopt(handle_, SOL_SOCKET, SO_ERROR, reinterpret_cast(&error), &len), 0); - return error; + Result GetSockError() const { + std::int32_t optval = 0; + socklen_t len = sizeof(optval); + auto ret = getsockopt(handle_, SOL_SOCKET, SO_ERROR, reinterpret_cast(&optval), &len); + if (ret != 0) { + auto errc = std::error_code{system::LastError(), std::system_category()}; + return Fail("Failed to retrieve socket error.", std::move(errc)); + } + if (optval != 0) { + auto errc = std::error_code{optval, std::system_category()}; + return Fail("Socket error.", std::move(errc)); + } + return Success(); } + /** \brief check if anything bad happens */ bool BadSocket() const { - if (IsClosed()) return true; - std::int32_t err = GetSockError(); - if (err == EBADF || err == EINTR) return true; + if (IsClosed()) { + return true; + } + auto err = GetSockError(); + if (err.Code() == std::error_code{EBADF, std::system_category()} || // NOLINT + err.Code() == std::error_code{EINTR, std::system_category()}) { // NOLINT + return true; + } return false; } - void SetNonBlock() { - bool non_block{true}; + void SetNonBlock(bool non_block) { #if defined(_WIN32) u_long mode = non_block ? 1 : 0; xgboost_CHECK_SYS_CALL(ioctlsocket(handle_, FIONBIO, &mode), NO_ERROR); @@ -530,10 +551,20 @@ class TCPSocket { }; /** - * \brief Connect to remote address, returns the error code if failed (no exception is - * raised so that we can retry). 
+ * @brief Connect to remote address, returns the error code if failed. + * + * @param host Host IP address. + * @param port Connection port. + * @param retry Number of retries to attempt. + * @param timeout Timeout of each connection attempt. + * @param out_conn Output socket if the connection is successful. Value is invalid and undefined if + * the connection failed. + * + * @return Connection status. */ -std::error_code Connect(SockAddress const &addr, TCPSocket *out); +[[nodiscard]] Result Connect(xgboost::StringView host, std::int32_t port, std::int32_t retry, + std::chrono::seconds timeout, + xgboost::collective::TCPSocket *out_conn); /** * \brief Get the local host name. diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 41fd6405a..2e0933a43 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -94,6 +94,10 @@ def no_ipv6() -> PytestSkip: return {"condition": not has_ipv6(), "reason": "IPv6 is required to be enabled."} +def not_linux() -> PytestSkip: + return {"condition": system() != "Linux", "reason": "Linux is required."} + + def no_ubjson() -> PytestSkip: return no_mod("ubjson") diff --git a/rabit/include/rabit/internal/socket.h b/rabit/include/rabit/internal/socket.h index cb7d4a078..6fb7fe725 100644 --- a/rabit/include/rabit/internal/socket.h +++ b/rabit/include/rabit/internal/socket.h @@ -1,10 +1,11 @@ -/*! - * Copyright (c) 2014-2022 by XGBoost Contributors +/** + * Copyright 2014-2023, XGBoost Contributors * \file socket.h * \author Tianqi Chen */ #ifndef RABIT_INTERNAL_SOCKET_H_ #define RABIT_INTERNAL_SOCKET_H_ +#include "xgboost/collective/result.h" #include "xgboost/collective/socket.h" #if defined(_WIN32) @@ -77,7 +78,7 @@ namespace rabit { namespace utils { template -int PollImpl(PollFD *pfd, int nfds, std::chrono::seconds timeout) { +int PollImpl(PollFD* pfd, int nfds, std::chrono::seconds timeout) noexcept(true) { #if defined(_WIN32) #if IS_MINGW() @@ -135,11 +136,11 @@ struct PollHelper { * \brief Check if the descriptor is ready for read * \param fd file descriptor to check status */ - inline bool CheckRead(SOCKET fd) const { + [[nodiscard]] bool CheckRead(SOCKET fd) const { const auto& pfd = fds.find(fd); return pfd != fds.end() && ((pfd->second.events & POLLIN) != 0); } - bool CheckRead(xgboost::collective::TCPSocket const &socket) const { + [[nodiscard]] bool CheckRead(xgboost::collective::TCPSocket const& socket) const { return this->CheckRead(socket.Handle()); } @@ -147,19 +148,19 @@ struct PollHelper { * \brief Check if the descriptor is ready for write * \param fd file descriptor to check status */ - inline bool CheckWrite(SOCKET fd) const { + [[nodiscard]] bool CheckWrite(SOCKET fd) const { const auto& pfd = fds.find(fd); return pfd != fds.end() && ((pfd->second.events & POLLOUT) != 0); } - bool CheckWrite(xgboost::collective::TCPSocket const &socket) const { + [[nodiscard]] bool CheckWrite(xgboost::collective::TCPSocket const& socket) const { return this->CheckWrite(socket.Handle()); } - /*! - * \brief perform poll on the set defined, read, write, exception - * \param timeout specify timeout in milliseconds(ms) if negative, means poll will block - * \return + /** + * @brief perform poll on the set defined, read, write, exception + * + * @param timeout specify timeout in seconds. Block if negative. 
*/ - inline void Poll(std::chrono::seconds timeout) { // NOLINT(*) + [[nodiscard]] xgboost::collective::Result Poll(std::chrono::seconds timeout) { std::vector fdset; fdset.reserve(fds.size()); for (auto kv : fds) { @@ -167,9 +168,9 @@ struct PollHelper { } int ret = PollImpl(fdset.data(), fdset.size(), timeout); if (ret == 0) { - LOG(FATAL) << "Poll timeout"; + return xgboost::collective::Fail("Poll timeout."); } else if (ret < 0) { - LOG(FATAL) << "Failed to poll."; + return xgboost::system::FailWithCode("Poll failed."); } else { for (auto& pfd : fdset) { auto revents = pfd.revents & pfd.events; @@ -180,6 +181,7 @@ struct PollHelper { } } } + return xgboost::collective::Success(); } std::unordered_map fds; diff --git a/rabit/src/allreduce_base.cc b/rabit/src/allreduce_base.cc index ac08ac12a..bd48d3599 100644 --- a/rabit/src/allreduce_base.cc +++ b/rabit/src/allreduce_base.cc @@ -1,5 +1,5 @@ -/*! - * Copyright (c) 2014 by Contributors +/** + * Copyright 2014-2023, XGBoost Contributors * \file allreduce_base.cc * \brief Basic implementation of AllReduce * @@ -9,9 +9,11 @@ #define NOMINMAX #endif // !defined(NOMINMAX) +#include "allreduce_base.h" + #include "rabit/base.h" #include "rabit/internal/rabit-inl.h" -#include "allreduce_base.h" +#include "xgboost/collective/result.h" #ifndef _WIN32 #include @@ -20,8 +22,7 @@ #include #include -namespace rabit { -namespace engine { +namespace rabit::engine { // constructor AllreduceBase::AllreduceBase() { tracker_uri = "NULL"; @@ -116,7 +117,12 @@ bool AllreduceBase::Init(int argc, char* argv[]) { utils::Assert(all_links.size() == 0, "can only call Init once"); this->host_uri = xgboost::collective::GetHostName(); // get information from tracker - return this->ReConnectLinks(); + auto rc = this->ReConnectLinks(); + if (rc.OK()) { + return true; + } + LOG(FATAL) << rc.Report(); + return false; } bool AllreduceBase::Shutdown() { @@ -131,7 +137,11 @@ bool AllreduceBase::Shutdown() { if (tracker_uri == "NULL") return true; // notify tracker rank i have shutdown - xgboost::collective::TCPSocket tracker = this->ConnectTracker(); + xgboost::collective::TCPSocket tracker; + auto rc = this->ConnectTracker(&tracker); + if (!rc.OK()) { + LOG(FATAL) << rc.Report(); + } tracker.Send(xgboost::StringView{"shutdown"}); tracker.Close(); xgboost::system::SocketFinalize(); @@ -146,7 +156,12 @@ void AllreduceBase::TrackerPrint(const std::string &msg) { if (tracker_uri == "NULL") { utils::Printf("%s", msg.c_str()); return; } - xgboost::collective::TCPSocket tracker = this->ConnectTracker(); + xgboost::collective::TCPSocket tracker; + auto rc = this->ConnectTracker(&tracker); + if (!rc.OK()) { + LOG(FATAL) << rc.Report(); + } + tracker.Send(xgboost::StringView{"print"}); tracker.Send(xgboost::StringView{msg}); tracker.Close(); @@ -215,64 +230,67 @@ void AllreduceBase::SetParam(const char *name, const char *val) { } } } + /*! 
* \brief initialize connection to the tracker * \return a socket that initializes the connection */ -xgboost::collective::TCPSocket AllreduceBase::ConnectTracker() const { +[[nodiscard]] xgboost::collective::Result AllreduceBase::ConnectTracker( + xgboost::collective::TCPSocket *out) const { int magic = kMagic; // get information from tracker - xgboost::collective::TCPSocket tracker; + xgboost::collective::TCPSocket &tracker = *out; - int retry = 0; - do { - auto rc = xgboost::collective::Connect( - xgboost::collective::MakeSockAddress(xgboost::StringView{tracker_uri}, tracker_port), - &tracker); - if (rc != std::errc()) { - if (++retry >= connect_retry) { - LOG(FATAL) << "Connecting to (failed): [" << tracker_uri << "]\n" << rc.message(); - } else { - LOG(WARNING) << rc.message() << "\nRetry connecting to IP(retry time: " << retry << "): [" - << tracker_uri << "]"; -#if defined(_MSC_VER) || defined(__MINGW32__) - Sleep(retry << 1); -#else - sleep(retry << 1); -#endif - continue; - } - } - break; - } while (true); + auto rc = + Connect(xgboost::StringView{tracker_uri}, tracker_port, connect_retry, timeout_sec, &tracker); + if (!rc.OK()) { + return xgboost::collective::Fail("Failed to connect to the tracker.", std::move(rc)); + } using utils::Assert; - CHECK_EQ(tracker.SendAll(&magic, sizeof(magic)), sizeof(magic)); - CHECK_EQ(tracker.RecvAll(&magic, sizeof(magic)), sizeof(magic)); - utils::Check(magic == kMagic, "sync::Invalid tracker message, init failure"); - Assert(tracker.SendAll(&rank, sizeof(rank)) == sizeof(rank), "ReConnectLink failure 3"); - Assert(tracker.SendAll(&world_size, sizeof(world_size)) == sizeof(world_size), - "ReConnectLink failure 3"); - CHECK_EQ(tracker.Send(xgboost::StringView{task_id}), task_id.size()); - return tracker; + if (tracker.SendAll(&magic, sizeof(magic)) != sizeof(magic)) { + return xgboost::collective::Fail("Failed to send the verification number."); + } + if (tracker.RecvAll(&magic, sizeof(magic)) != sizeof(magic)) { + return xgboost::collective::Fail("Failed to receive the verification number."); + } + if (magic != kMagic) { + return xgboost::collective::Fail("Invalid verification number."); + } + if (tracker.SendAll(&rank, sizeof(rank)) != sizeof(rank)) { + return xgboost::collective::Fail("Failed to send the local rank back to the tracker."); + } + if (tracker.SendAll(&world_size, sizeof(world_size)) != sizeof(world_size)) { + return xgboost::collective::Fail("Failed to send the world size back to the tracker."); + } + if (tracker.Send(xgboost::StringView{task_id}) != task_id.size()) { + return xgboost::collective::Fail("Failed to send the task ID back to the tracker."); + } + + return xgboost::collective::Success(); }
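With ConnectTracker now returning a Result instead of a socket, every caller follows the same consuming shape. A self-contained sketch of that pattern, using only the Result API added in include/xgboost/collective/result.h above; the functions Low and High are invented for illustration and do not appear in the patch:

    #include <iostream>
    #include <system_error>
    #include <utility>

    #include "xgboost/collective/result.h"

    namespace collective = xgboost::collective;

    // A low-level step that fails with a recorded system error code.
    collective::Result Low() {
      return collective::Fail("Socket error.",
                              std::make_error_code(std::errc::connection_refused));
    }

    // A higher-level step that wraps the failure with context; the root
    // error code travels along the chain.
    collective::Result High() {
      auto rc = Low();
      if (!rc.OK()) {
        return collective::Fail("Failed to connect to the tracker.", std::move(rc));
      }
      return collective::Success();
    }

    int main() {
      auto rc = High();
      if (!rc.OK()) {
        // Report() prints each chained message; Code() walks the chain to
        // recover the root system error, if one was recorded.
        std::cerr << rc.Report() << "\nroot: " << rc.Code().message() << "\n";
      }
      return rc.OK() ? 0 : 1;
    }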
/*! * \brief connect to the tracker to fix the missing links * this function is also used when the engine starts up */ -bool AllreduceBase::ReConnectLinks(const char *cmd) { +[[nodiscard]] xgboost::collective::Result AllreduceBase::ReConnectLinks(const char *cmd) { // single node mode if (tracker_uri == "NULL") { rank = 0; world_size = 1; - return true; + return xgboost::collective::Success(); } - try { - xgboost::collective::TCPSocket tracker = this->ConnectTracker(); - LOG(INFO) << "task " << task_id << " connected to the tracker"; - tracker.Send(xgboost::StringView{cmd}); + xgboost::collective::TCPSocket tracker; + auto rc = this->ConnectTracker(&tracker); + if (!rc.OK()) { + return xgboost::collective::Fail("Failed to connect to the tracker.", std::move(rc)); + } + LOG(INFO) << "task " << task_id << " connected to the tracker"; + tracker.Send(xgboost::StringView{cmd}); + + try { // the rank of previous link, next link in ring int prev_rank, next_rank; // the rank of neighbors @@ -334,10 +352,10 @@ bool AllreduceBase::ReConnectLinks(const char *cmd) { tracker.Recv(&hname); Assert(tracker.RecvAll(&hport, sizeof(hport)) == sizeof(hport), "ReConnectLink failure 9"); Assert(tracker.RecvAll(&hrank, sizeof(hrank)) == sizeof(hrank), "ReConnectLink failure 10"); - - if (xgboost::collective::Connect( - xgboost::collective::MakeSockAddress(xgboost::StringView{hname}, hport), &r.sock) != - std::errc{}) { + // connect to peer + if (!xgboost::collective::Connect(xgboost::StringView{hname}, hport, connect_retry, + timeout_sec, &r.sock) + .OK()) { num_error += 1; r.sock.Close(); continue; } @@ -351,8 +369,7 @@ bool AllreduceBase::ReConnectLinks(const char *cmd) { bool match = false; for (auto & all_link : all_links) { if (all_link.rank == hrank) { - Assert(all_link.sock.IsClosed(), - "Override a link that is active"); + Assert(all_link.sock.IsClosed(), "Override a link that is active"); all_link.sock = std::move(r.sock); match = true; break; } } @@ -364,10 +381,10 @@ "ReConnectLink failure 14"); } while (num_error != 0); // send back socket listening port to tracker - Assert(tracker.SendAll(&port, sizeof(port)) == sizeof(port), - "ReConnectLink failure 14"); + Assert(tracker.SendAll(&port, sizeof(port)) == sizeof(port), "ReConnectLink failure 14"); // close connection to tracker tracker.Close(); + // listen to incoming links for (int i = 0; i < num_accept; ++i) { LinkRecord r; @@ -395,7 +412,7 @@ for (auto &all_link : all_links) { utils::Assert(!all_link.sock.BadSocket(), "ReConnectLink: bad socket"); // set the socket to non-blocking mode, enable TCP keepalive - all_link.sock.SetNonBlock(); + all_link.sock.SetNonBlock(true); all_link.sock.SetKeepAlive(); if (rabit_enable_tcp_no_delay) { all_link.sock.SetNoDelay(); } @@ -415,10 +432,11 @@ "cannot find prev ring in the link"); Assert(next_rank == -1 || ring_next != nullptr, "cannot find next ring in the link"); - return true; + return xgboost::collective::Success(); } catch (const std::exception& e) { - LOG(WARNING) << "failed in ReconnectLink " << e.what(); - return false; + std::stringstream ss; + ss << "Failed in ReConnectLinks " << e.what(); + return xgboost::collective::Fail(ss.str()); } } /*!
@@ -523,9 +541,15 @@ AllreduceBase::TryAllreduceTree(void *sendrecvbuf_, } } // finish running allreduce - if (finished) break; + if (finished) { + break; + } // select must return - watcher.Poll(timeout_sec); + auto poll_res = watcher.Poll(timeout_sec); + if (!poll_res.OK()) { + LOG(FATAL) << poll_res.Report(); + } + // read data from childs for (int i = 0; i < nlink; ++i) { if (i != parent_index && watcher.CheckRead(links[i].sock)) { @@ -698,7 +722,10 @@ AllreduceBase::TryBroadcast(void *sendrecvbuf_, size_t total_size, int root) { // finish running if (finished) break; // select - watcher.Poll(timeout_sec); + auto poll_res = watcher.Poll(timeout_sec); + if (!poll_res.OK()) { + LOG(FATAL) << poll_res.Report(); + } if (in_link == -2) { // probe in-link for (int i = 0; i < nlink; ++i) { @@ -780,8 +807,14 @@ AllreduceBase::TryAllgatherRing(void *sendrecvbuf_, size_t total_size, } finished = false; } - if (finished) break; - watcher.Poll(timeout_sec); + if (finished) { + break; + } + + auto poll_res = watcher.Poll(timeout_sec); + if (!poll_res.OK()) { + LOG(FATAL) << poll_res.Report(); + } if (read_ptr != stop_read && watcher.CheckRead(next.sock)) { size_t size = stop_read - read_ptr; size_t start = read_ptr % total_size; @@ -880,8 +913,13 @@ AllreduceBase::TryReduceScatterRing(void *sendrecvbuf_, } finished = false; } - if (finished) break; - watcher.Poll(timeout_sec); + if (finished) { + break; + } + auto poll_res = watcher.Poll(timeout_sec); + if (!poll_res.OK()) { + LOG(FATAL) << poll_res.Report(); + } if (read_ptr != stop_read && watcher.CheckRead(next.sock)) { ReturnType ret = next.ReadToRingBuffer(reduce_ptr, stop_read); if (ret != kSuccess) { @@ -953,5 +991,4 @@ AllreduceBase::TryAllreduceRing(void *sendrecvbuf_, (std::min((prank + 1) * step, count) - std::min(prank * step, count)) * type_nbytes); } -} // namespace engine -} // namespace rabit +} // namespace rabit::engine diff --git a/rabit/src/allreduce_base.h b/rabit/src/allreduce_base.h index 67fef0ba6..f40754273 100644 --- a/rabit/src/allreduce_base.h +++ b/rabit/src/allreduce_base.h @@ -12,14 +12,16 @@ #ifndef RABIT_ALLREDUCE_BASE_H_ #define RABIT_ALLREDUCE_BASE_H_ +#include #include #include -#include #include -#include -#include "rabit/internal/utils.h" +#include + #include "rabit/internal/engine.h" #include "rabit/internal/socket.h" +#include "rabit/internal/utils.h" +#include "xgboost/collective/result.h" #ifdef RABIT_CXXTESTDEFS_H #define private public @@ -329,13 +331,13 @@ class AllreduceBase : public IEngine { * \brief initialize connection to the tracker * \return a socket that initializes the connection */ - xgboost::collective::TCPSocket ConnectTracker() const; + [[nodiscard]] xgboost::collective::Result ConnectTracker(xgboost::collective::TCPSocket *out) const; /*! * \brief connect to the tracker to fix the the missing links * this function is also used when the engine start up * \param cmd possible command to sent to tracker */ - bool ReConnectLinks(const char *cmd = "start"); + [[nodiscard]] xgboost::collective::Result ReConnectLinks(const char *cmd = "start"); /*! * \brief perform in-place allreduce, on sendrecvbuf, this function can fail, and will return the cause of failure * diff --git a/src/collective/socket.cc b/src/collective/socket.cc index 1ab84cef3..78dc3d79b 100644 --- a/src/collective/socket.cc +++ b/src/collective/socket.cc @@ -1,19 +1,22 @@ -/*! 
- * Copyright (c) 2022 by XGBoost Contributors +/** + * Copyright 2022-2023 by XGBoost Contributors */ #include "xgboost/collective/socket.h" #include // std::size_t #include // std::int32_t #include // std::memcpy, std::memset +#include // for path #include // std::error_code, std::system_category +#include "rabit/internal/socket.h" // for PollHelper +#include "xgboost/collective/result.h" // for Result + #if defined(__unix__) || defined(__APPLE__) #include // getaddrinfo, freeaddrinfo #endif // defined(__unix__) || defined(__APPLE__) -namespace xgboost { -namespace collective { +namespace xgboost::collective { SockAddress MakeSockAddress(StringView host, in_port_t port) { struct addrinfo hints; std::memset(&hints, 0, sizeof(hints)); @@ -71,7 +74,12 @@ std::size_t TCPSocket::Recv(std::string *p_str) { return bytes; } -std::error_code Connect(SockAddress const &addr, TCPSocket *out) { +[[nodiscard]] Result Connect(xgboost::StringView host, std::int32_t port, std::int32_t retry, + std::chrono::seconds timeout, + xgboost::collective::TCPSocket *out_conn) { + auto addr = MakeSockAddress(xgboost::StringView{host}, port); + auto &conn = *out_conn; + sockaddr const *addr_handle{nullptr}; socklen_t addr_len{0}; if (addr.IsV4()) { @@ -81,14 +89,67 @@ std::error_code Connect(SockAddress const &addr, TCPSocket *out) { addr_handle = reinterpret_cast(&addr.V6().Handle()); addr_len = sizeof(addr.V6().Handle()); } - auto socket = TCPSocket::Create(addr.Domain()); - CHECK_EQ(static_cast(socket.Domain()), static_cast(addr.Domain())); - auto rc = connect(socket.Handle(), addr_handle, addr_len); - if (rc != 0) { - return std::error_code{errno, std::system_category()}; + + conn = TCPSocket::Create(addr.Domain()); + CHECK_EQ(static_cast(conn.Domain()), static_cast(addr.Domain())); + conn.SetNonBlock(true); + + Result last_error; + auto log_failure = [&host, &last_error](Result err, char const *file, std::int32_t line) { + last_error = std::move(err); + LOG(WARNING) << std::filesystem::path{file}.filename().string() << "(" << line + << "): Failed to connect to:" << host << " Error:" << last_error.Report(); + }; + + for (std::int32_t attempt = 0; attempt < std::max(retry, 1); ++attempt) { + if (attempt > 0) { + LOG(WARNING) << "Retrying connection to " << host << " for the " << attempt << " time."; +#if defined(_MSC_VER) || defined(__MINGW32__) + Sleep(attempt << 1); +#else + sleep(attempt << 1); +#endif + } + + auto rc = connect(conn.Handle(), addr_handle, addr_len); + if (rc != 0) { + auto errcode = system::LastError(); + if (!system::ErrorWouldBlock(errcode)) { + log_failure(Fail("connect failed.", std::error_code{errcode, std::system_category()}), + __FILE__, __LINE__); + continue; + } + + rabit::utils::PollHelper poll; + poll.WatchWrite(conn); + auto result = poll.Poll(timeout); + if (!result.OK()) { + log_failure(std::move(result), __FILE__, __LINE__); + continue; + } + if (!poll.CheckWrite(conn)) { + log_failure(Fail("poll failed.", std::error_code{errcode, std::system_category()}), + __FILE__, __LINE__); + continue; + } + result = conn.GetSockError(); + if (!result.OK()) { + log_failure(std::move(result), __FILE__, __LINE__); + continue; + } + + conn.SetNonBlock(false); + return Success(); + + } else { + conn.SetNonBlock(false); + return Success(); + } } - *out = std::move(socket); - return std::make_error_code(std::errc{}); + + std::stringstream ss; + ss << "Failed to connect to " << host << ":" << port; + conn.Close(); + return Fail(ss.str(), std::move(last_error)); } -} // namespace collective -} // 
namespace xgboost +} // namespace xgboost::collective diff --git a/tests/cpp/collective/test_socket.cc b/tests/cpp/collective/test_socket.cc index 571e95f4d..ddc73d1f2 100644 --- a/tests/cpp/collective/test_socket.cc +++ b/tests/cpp/collective/test_socket.cc @@ -1,5 +1,5 @@ -/*! - * Copyright (c) 2022 by XGBoost Contributors +/** + * Copyright 2022-2023 by XGBoost Contributors */ #include #include @@ -10,8 +10,7 @@ #include "../helpers.h" -namespace xgboost { -namespace collective { +namespace xgboost::collective { TEST(Socket, Basic) { system::SocketStartup(); @@ -31,15 +30,16 @@ TEST(Socket, Basic) { TCPSocket client; if (domain == SockDomain::kV4) { auto const& addr = SockAddrV4::Loopback().Addr(); - ASSERT_EQ(Connect(MakeSockAddress(StringView{addr}, port), &client), std::errc{}); + auto rc = Connect(StringView{addr}, port, 1, std::chrono::seconds{3}, &client); + ASSERT_TRUE(rc.OK()) << rc.Report(); } else { auto const& addr = SockAddrV6::Loopback().Addr(); - auto rc = Connect(MakeSockAddress(StringView{addr}, port), &client); + auto rc = Connect(StringView{addr}, port, 1, std::chrono::seconds{3}, &client); // some environment (docker) has restricted network configuration. - if (rc == std::error_code{EADDRNOTAVAIL, std::system_category()}) { + if (!rc.OK() && rc.Code() == std::error_code{EADDRNOTAVAIL, std::system_category()}) { GTEST_SKIP_(msg.c_str()); } - ASSERT_EQ(rc, std::errc{}); + ASSERT_EQ(rc, Success()) << rc.Report(); } ASSERT_EQ(client.Domain(), domain); @@ -73,5 +73,4 @@ TEST(Socket, Basic) { system::SocketFinalize(); } -} // namespace collective -} // namespace xgboost +} // namespace xgboost::collective diff --git a/tests/python/test_tracker.py b/tests/python/test_tracker.py index 8709589dd..1f42711a2 100644 --- a/tests/python/test_tracker.py +++ b/tests/python/test_tracker.py @@ -20,6 +20,18 @@ def test_rabit_tracker(): assert str(ret) == "test1234" +@pytest.mark.skipif(**tm.not_linux()) +def test_socket_error(): + tracker = RabitTracker(host_ip="127.0.0.1", n_workers=1) + tracker.start(1) + env = tracker.worker_envs() + env["DMLC_TRACKER_PORT"] = 0 + env["DMLC_WORKER_CONNECT_RETRY"] = 1 + with pytest.raises(ValueError, match="127.0.0.1:0\n.*refused"): + with xgb.collective.CommunicatorContext(**env): + pass + + def run_rabit_ops(client, n_workers): from xgboost.dask import CommunicatorContext, _get_dask_config, _get_rabit_args @@ -58,6 +70,32 @@ def test_rabit_ops(): run_rabit_ops(client, n_workers) +def run_broadcast(client): + from xgboost.dask import _get_dask_config, _get_rabit_args + + workers = tm.get_client_workers(client) + rabit_args = client.sync(_get_rabit_args, len(workers), _get_dask_config(), client) + + def local_test(worker_id): + with collective.CommunicatorContext(**rabit_args): + res = collective.broadcast(17, 0) + return res + + futures = client.map(local_test, range(len(workers)), workers=workers) + results = client.gather(futures) + np.testing.assert_allclose(np.array(results), 17) + + +@pytest.mark.skipif(**tm.no_dask()) +def test_broadcast(): + from distributed import Client, LocalCluster + + n_workers = 3 + with LocalCluster(n_workers=n_workers) as cluster: + with Client(cluster) as client: + run_broadcast(client) + + @pytest.mark.skipif(**tm.no_ipv6()) @pytest.mark.skipif(**tm.no_dask()) def test_rabit_ops_ipv6(): From 9bab06cbca1ea78b1f17b20b2487d6d71359f1fa Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Thu, 31 Aug 2023 03:09:35 -0700 Subject: [PATCH 130/136] Support column split in gpu hist updater (#9384) --- src/collective/aggregator.cuh | 
40 ++++++++
 src/tree/gpu_hist/evaluate_splits.cu | 3 +-
 src/tree/gpu_hist/histogram.cu | 7 +-
 src/tree/gpu_hist/histogram.cuh | 2 +-
 src/tree/gpu_hist/row_partitioner.cuh | 2 +-
 src/tree/updater_gpu_hist.cu | 91 ++++++++++++++++---
 .../cpp/tree/gpu_hist/test_evaluate_splits.cu | 2 +-
 tests/cpp/tree/gpu_hist/test_histogram.cu | 6 +-
 .../cpp/tree/gpu_hist/test_row_partitioner.cu | 6 +-
 tests/cpp/tree/test_gpu_hist.cu | 56 +++++++++++-
 10 files changed, 187 insertions(+), 28 deletions(-)
 create mode 100644 src/collective/aggregator.cuh

diff --git a/src/collective/aggregator.cuh b/src/collective/aggregator.cuh
new file mode 100644
index 000000000..a87a968ab
--- /dev/null
+++ b/src/collective/aggregator.cuh
@@ -0,0 +1,40 @@
+/**
+ * Copyright 2023 by XGBoost contributors
+ *
+ * Higher level functions built on top of the Communicator API, taking care of behavioral differences
+ * between row-split vs column-split distributed training, and horizontal vs vertical federated
+ * learning.
+ */
+#pragma once
+#include
+
+#include
+#include
+#include
+#include
+
+#include "communicator-inl.cuh"
+
+namespace xgboost {
+namespace collective {
+
+/**
+ * @brief Find the global sum of the given values across all workers.
+ *
+ * This only applies when the data is split row-wise (horizontally). When data is split
+ * column-wise (vertically), the original values are returned.
+ *
+ * @tparam T The type of the values.
+ * @param info MetaInfo about the DMatrix.
+ * @param device The device id.
+ * @param values Pointer to the inputs to sum.
+ * @param size Number of values to sum.
+ */
+template <typename T>
+void GlobalSum(MetaInfo const& info, int device, T* values, size_t size) {
+  if (info.IsRowSplit()) {
+    collective::AllReduce<collective::Operation::kSum>(device, values, size);
+  }
+}
+}  // namespace collective
+}  // namespace xgboost
diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu
index ecfc6c3ce..b9a4424a5 100644
--- a/src/tree/gpu_hist/evaluate_splits.cu
+++ b/src/tree/gpu_hist/evaluate_splits.cu
@@ -418,7 +418,8 @@ void GPUHistEvaluator::EvaluateSplits(
   // Reduce to get the best candidate from all workers.
   dh::LaunchN(out_splits.size(), [world_size, all_candidates, out_splits] __device__(size_t i) {
-    for (auto rank = 0; rank < world_size; rank++) {
+    out_splits[i] = all_candidates[i];
+    for (auto rank = 1; rank < world_size; rank++) {
       out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i];
     }
   });
diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu
index 489c8d6f7..22eb7ab81 100644
--- a/src/tree/gpu_hist/histogram.cu
+++ b/src/tree/gpu_hist/histogram.cu
@@ -8,6 +8,7 @@
 #include  // uint32_t
 #include
+#include "../../collective/aggregator.h"
 #include "../../common/deterministic.cuh"
 #include "../../common/device_helpers.cuh"
 #include "../../data/ellpack_page.cuh"
@@ -52,7 +53,7 @@ struct Clip : public thrust::unary_function {
  *
  * to avoid outliers, as the full reduction is reproducible on GPU with reduction tree.
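The `GlobalSum` helper above is deliberately a no-op for column-wise splits: when data is partitioned by feature, every worker already owns the gradient statistics for all rows, so reducing again would double-count them. A toy Python model of the contract, with a user-supplied `allreduce_sum` callable standing in for the communicator:

    import numpy as np

    def global_sum(values, is_row_split, allreduce_sum):
        # Sum across workers only when rows are partitioned; under a column
        # (vertical) split each worker already sees every row.
        return allreduce_sum(values) if is_row_split else values

    # Two simulated workers holding partial sums of a row-partitioned dataset:
    parts = [np.array([1.0, 2.0]), np.array([3.0, 4.0])]
    toy_allreduce = lambda _: parts[0] + parts[1]
    print(global_sum(parts[0], True, toy_allreduce))   # [4. 6.]
    print(global_sum(parts[0], False, toy_allreduce))  # [1. 2.], unchanged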
*/ -GradientQuantiser::GradientQuantiser(common::Span gpair) { +GradientQuantiser::GradientQuantiser(common::Span gpair, MetaInfo const& info) { using GradientSumT = GradientPairPrecise; using T = typename GradientSumT::ValueT; dh::XGBCachingDeviceAllocator alloc; @@ -64,11 +65,11 @@ GradientQuantiser::GradientQuantiser(common::Span gpair) { // Treat pair as array of 4 primitive types to allreduce using ReduceT = typename decltype(p.first)::ValueT; static_assert(sizeof(Pair) == sizeof(ReduceT) * 4, "Expected to reduce four elements."); - collective::Allreduce(reinterpret_cast(&p), 4); + collective::GlobalSum(info, reinterpret_cast(&p), 4); GradientPair positive_sum{p.first}, negative_sum{p.second}; std::size_t total_rows = gpair.size(); - collective::Allreduce(&total_rows, 1); + collective::GlobalSum(info, &total_rows, 1); auto histogram_rounding = GradientSumT{common::CreateRoundingFactor( diff --git a/src/tree/gpu_hist/histogram.cuh b/src/tree/gpu_hist/histogram.cuh index eb9008d48..c693e2e62 100644 --- a/src/tree/gpu_hist/histogram.cuh +++ b/src/tree/gpu_hist/histogram.cuh @@ -39,7 +39,7 @@ private: GradientPairPrecise to_floating_point_; public: - explicit GradientQuantiser(common::Span gpair); + GradientQuantiser(common::Span gpair, MetaInfo const& info); XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const { auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(), gpair.GetHess() * to_fixed_point_.GetHess()); diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 215a0e49b..64ca540f6 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -129,7 +129,7 @@ void SortPositionBatch(common::Span> d_batch_info, int batch_idx; std::size_t item_idx; AssignBatch(batch_info_itr, idx, &batch_idx, &item_idx); - auto op_res = op(ridx[item_idx], batch_info_itr[batch_idx].data); + auto op_res = op(ridx[item_idx], batch_idx, batch_info_itr[batch_idx].data); return IndexFlagTuple{static_cast(item_idx), op_res, batch_idx, op_res}; }); size_t temp_bytes = 0; diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 0e42f1562..57eec0db8 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -12,7 +12,8 @@ #include // for move #include -#include "../collective/communicator-inl.cuh" +#include "../collective/aggregator.h" +#include "../collective/aggregator.cuh" #include "../common/bitfield.h" #include "../common/categorical.h" #include "../common/cuda_context.cuh" // CUDAContext @@ -161,6 +162,7 @@ struct GPUHistMakerDevice { GPUHistEvaluator evaluator_; Context const* ctx_; std::shared_ptr column_sampler_; + MetaInfo const& info_; public: EllpackPageImpl const* page{nullptr}; @@ -193,13 +195,14 @@ struct GPUHistMakerDevice { GPUHistMakerDevice(Context const* ctx, bool is_external_memory, common::Span _feature_types, bst_row_t _n_rows, TrainParam _param, std::shared_ptr column_sampler, - uint32_t n_features, BatchParam batch_param) + uint32_t n_features, BatchParam batch_param, MetaInfo const& info) : evaluator_{_param, n_features, ctx->gpu_id}, ctx_(ctx), feature_types{_feature_types}, param(std::move(_param)), column_sampler_(std::move(column_sampler)), - interaction_constraints(param, n_features) { + interaction_constraints(param, n_features), + info_{info} { sampler = std::make_unique(ctx, _n_rows, batch_param, param.subsample, param.sampling_method, is_external_memory); if (!param.monotone_constraints.empty()) { @@ -245,7 
+248,7 @@ struct GPUHistMakerDevice { this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, dmat->Info().IsColumnSplit(), ctx_->gpu_id); - quantiser = std::make_unique(this->gpair); + quantiser = std::make_unique(this->gpair, dmat->Info()); row_partitioner.reset(); // Release the device memory first before reallocating row_partitioner = std::make_unique(ctx_->gpu_id, sample.sample_rows); @@ -369,6 +372,66 @@ struct GPUHistMakerDevice { common::KCatBitField node_cats; }; + void UpdatePositionColumnSplit(EllpackDeviceAccessor d_matrix, + std::vector const& split_data, + std::vector const& nidx, + std::vector const& left_nidx, + std::vector const& right_nidx) { + auto const num_candidates = split_data.size(); + + using BitVector = LBitField64; + using BitType = BitVector::value_type; + auto const size = BitVector::ComputeStorageSize(d_matrix.n_rows * num_candidates); + dh::TemporaryArray decision_storage(size, 0); + dh::TemporaryArray missing_storage(size, 0); + BitVector decision_bits{dh::ToSpan(decision_storage)}; + BitVector missing_bits{dh::ToSpan(missing_storage)}; + + dh::TemporaryArray split_data_storage(num_candidates); + dh::safe_cuda(cudaMemcpyAsync(split_data_storage.data().get(), split_data.data(), + num_candidates * sizeof(NodeSplitData), cudaMemcpyDefault)); + auto d_split_data = dh::ToSpan(split_data_storage); + + dh::LaunchN(d_matrix.n_rows, [=] __device__(std::size_t ridx) mutable { + for (auto i = 0; i < num_candidates; i++) { + auto const& data = d_split_data[i]; + auto const cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex()); + if (isnan(cut_value)) { + missing_bits.Set(ridx * num_candidates + i); + } else { + bool go_left; + if (data.split_type == FeatureType::kCategorical) { + go_left = common::Decision(data.node_cats.Bits(), cut_value); + } else { + go_left = cut_value <= data.split_node.SplitCond(); + } + if (go_left) { + decision_bits.Set(ridx * num_candidates + i); + } + } + } + }); + + collective::AllReduce( + ctx_->gpu_id, decision_storage.data().get(), decision_storage.size()); + collective::AllReduce( + ctx_->gpu_id, missing_storage.data().get(), missing_storage.size()); + collective::Synchronize(ctx_->gpu_id); + + row_partitioner->UpdatePositionBatch( + nidx, left_nidx, right_nidx, split_data, + [=] __device__(bst_uint ridx, int split_index, NodeSplitData const& data) { + auto const index = ridx * num_candidates + split_index; + bool go_left; + if (missing_bits.Check(index)) { + go_left = data.split_node.DefaultLeft(); + } else { + go_left = decision_bits.Check(index); + } + return go_left; + }); + } + void UpdatePosition(std::vector const& candidates, RegTree* p_tree) { if (candidates.empty()) { return; @@ -392,9 +455,15 @@ struct GPUHistMakerDevice { } auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); + + if (info_.IsColumnSplit()) { + UpdatePositionColumnSplit(d_matrix, split_data, nidx, left_nidx, right_nidx); + return; + } + row_partitioner->UpdatePositionBatch( nidx, left_nidx, right_nidx, split_data, - [=] __device__(bst_uint ridx, const NodeSplitData& data) { + [=] __device__(bst_uint ridx, int split_index, const NodeSplitData& data) { // given a row index, returns the node id it belongs to float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex()); // Missing value @@ -544,9 +613,8 @@ struct GPUHistMakerDevice { monitor.Start("AllReduce"); auto d_node_hist = hist.GetNodeHistogram(nidx).data(); using ReduceT = typename std::remove_pointer::type::ValueT; - collective::AllReduce( - 
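Under a column split only the worker that owns the split feature can evaluate a row, so `UpdatePositionColumnSplit` above lets every worker vote through two bit vectors, one for "row goes left" and one for "value missing here", combines them with bitwise AllReduce calls, and only then partitions rows. A rough Python model of that decision logic follows; numpy boolean arrays stand in for the `LBitField64` buffers, and since the reduction operators are elided in the diff above, the OR/AND pairing here is an assumption chosen so that the owning worker's verdict wins:

    import numpy as np

    def route_rows(local_values, split_cond, default_left, allreduce_or, allreduce_and):
        # local_values: one float per row on this worker; NaN when this worker
        # does not hold the split feature (or the value is genuinely missing).
        present = ~np.isnan(local_values)
        decision = present & (local_values <= split_cond)  # local go-left votes
        missing = ~present
        decision = allreduce_or(decision)   # any worker's vote survives
        missing = allreduce_and(missing)    # missing only if absent everywhere
        return np.where(missing, default_left, decision)  # True => left child

    # Two workers; worker 0 owns the split feature, worker 1 only sees NaN:
    w = [np.array([0.1, np.nan, 0.9]), np.full(3, np.nan)]
    local = [(~np.isnan(v) & (v <= 0.5), np.isnan(v)) for v in w]
    reduce_or = lambda _: local[0][0] | local[1][0]
    reduce_and = lambda _: local[0][1] & local[1][1]
    print(route_rows(w[0], 0.5, True, reduce_or, reduce_and))  # [ True  True False]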
ctx_->gpu_id, reinterpret_cast(d_node_hist), - page->Cuts().TotalBins() * 2 * num_histograms); + collective::GlobalSum(info_, ctx_->gpu_id, reinterpret_cast(d_node_hist), + page->Cuts().TotalBins() * 2 * num_histograms); monitor.Stop("AllReduce"); } @@ -663,8 +731,7 @@ struct GPUHistMakerDevice { dh::Reduce(ctx_->CUDACtx()->CTP(), gpair_it, gpair_it + gpair.size(), GradientPairInt64{}, thrust::plus{}); using ReduceT = typename decltype(root_sum_quantised)::ValueT; - collective::Allreduce( - reinterpret_cast(&root_sum_quantised), 2); + collective::GlobalSum(info_, reinterpret_cast(&root_sum_quantised), 2); hist.AllocateHistograms({kRootNIdx}); this->BuildHist(kRootNIdx); @@ -801,7 +868,7 @@ class GPUHistMaker : public TreeUpdater { info_->feature_types.SetDevice(ctx_->gpu_id); maker = std::make_unique( ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_, - *param, column_sampler_, info_->num_col_, batch_param); + *param, column_sampler_, info_->num_col_, batch_param, dmat->Info()); p_last_fmat_ = dmat; initialised_ = true; @@ -915,7 +982,7 @@ class GPUGlobalApproxMaker : public TreeUpdater { auto batch = BatchParam{param->max_bin, hess, !task_->const_hess}; maker_ = std::make_unique( ctx_, !p_fmat->SingleColBlock(), info.feature_types.ConstDeviceSpan(), info.num_row_, - *param, column_sampler_, info.num_col_, batch); + *param, column_sampler_, info.num_col_, batch, p_fmat->Info()); std::size_t t_idx{0}; for (xgboost::RegTree* tree : trees) { diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index f74b7d3ca..f4ed34bf0 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -24,7 +24,7 @@ auto ZeroParam() { inline GradientQuantiser DummyRoundingFactor() { thrust::device_vector gpair(1); gpair[0] = {1000.f, 1000.f}; // Tests should not exceed sum of 1000 - return GradientQuantiser(dh::ToSpan(gpair)); + return {dh::ToSpan(gpair), MetaInfo()}; } thrust::device_vector ConvertToInteger(std::vector x) { diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index 024a1e8d3..2eacd48e5 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -39,7 +39,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) { FeatureGroups feature_groups(page->Cuts(), page->is_dense, shm_size, sizeof(GradientPairInt64)); - auto quantiser = GradientQuantiser(gpair.DeviceSpan()); + auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo()); BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0), feature_groups.DeviceAccessor(0), gpair.DeviceSpan(), ridx, d_histogram, quantiser); @@ -53,7 +53,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) { dh::device_vector new_histogram(num_bins); auto d_new_histogram = dh::ToSpan(new_histogram); - auto quantiser = GradientQuantiser(gpair.DeviceSpan()); + auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo()); BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0), feature_groups.DeviceAccessor(0), gpair.DeviceSpan(), ridx, d_new_histogram, quantiser); @@ -131,7 +131,7 @@ void TestGPUHistogramCategorical(size_t num_categories) { dh::device_vector cat_hist(num_categories); auto gpair = GenerateRandomGradients(kRows, 0, 2); gpair.SetDevice(0); - auto quantiser = GradientQuantiser(gpair.DeviceSpan()); + auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo()); /** 
* Generate hist with cat data. */ diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 050980400..317728e01 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -30,7 +30,7 @@ void TestUpdatePositionBatch() { std::vector extra_data = {0}; // Send the first five training instances to the right node // and the second 5 to the left node - rp.UpdatePositionBatch({0}, {1}, {2}, extra_data, [=] __device__(RowPartitioner::RowIndexT ridx, int) { + rp.UpdatePositionBatch({0}, {1}, {2}, extra_data, [=] __device__(RowPartitioner::RowIndexT ridx, int, int) { return ridx > 4; }); rows = rp.GetRowsHost(1); @@ -43,7 +43,7 @@ void TestUpdatePositionBatch() { } // Split the left node again - rp.UpdatePositionBatch({1}, {3}, {4}, extra_data,[=] __device__(RowPartitioner::RowIndexT ridx, int) { + rp.UpdatePositionBatch({1}, {3}, {4}, extra_data,[=] __device__(RowPartitioner::RowIndexT ridx, int, int) { return ridx < 7; }); EXPECT_EQ(rp.GetRows(3).size(), 2); @@ -57,7 +57,7 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector ridx_tmp(ridx_in.size()); thrust::device_vector counts(segments.size()); - auto op = [=] __device__(auto ridx, int data) { return ridx % 2 == 0; }; + auto op = [=] __device__(auto ridx, int split_index, int data) { return ridx % 2 == 0; }; std::vector op_data(segments.size()); std::vector> h_batch_info(segments.size()); dh::TemporaryArray> d_batch_info(segments.size()); diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index 50cdae741..76734e526 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -93,7 +93,7 @@ void TestBuildHist(bool use_shared_memory_histograms) { Context ctx{MakeCUDACtx(0)}; auto cs = std::make_shared(0); GPUHistMakerDevice maker(&ctx, /*is_external_memory=*/false, {}, kNRows, param, cs, kNCols, - batch_param); + batch_param, MetaInfo()); xgboost::SimpleLCG gen; xgboost::SimpleRealUniformDistribution dist(0.0f, 1.0f); HostDeviceVector gpair(kNRows); @@ -111,7 +111,7 @@ void TestBuildHist(bool use_shared_memory_histograms) { maker.hist.AllocateHistograms({0}); maker.gpair = gpair.DeviceSpan(); - maker.quantiser = std::make_unique(maker.gpair); + maker.quantiser = std::make_unique(maker.gpair, MetaInfo()); maker.page = page.get(); maker.InitFeatureGroupsOnce(); @@ -165,7 +165,7 @@ HistogramCutsWrapper GetHostCutMatrix () { inline GradientQuantiser DummyRoundingFactor() { thrust::device_vector gpair(1); gpair[0] = {1000.f, 1000.f}; // Tests should not exceed sum of 1000 - return GradientQuantiser(dh::ToSpan(gpair)); + return {dh::ToSpan(gpair), MetaInfo()}; } void TestHistogramIndexImpl() { @@ -426,4 +426,54 @@ TEST(GpuHist, MaxDepth) { ASSERT_THROW({learner->UpdateOneIter(0, p_mat);}, dmlc::Error); } + +namespace { +RegTree GetUpdatedTree(Context const* ctx, DMatrix* dmat) { + ObjInfo task{ObjInfo::kRegression}; + GPUHistMaker hist_maker{ctx, &task}; + hist_maker.Configure(Args{}); + + TrainParam param; + param.UpdateAllowUnknown(Args{}); + + linalg::Matrix gpair({dmat->Info().num_row_}, ctx->Ordinal()); + gpair.Data()->Copy(GenerateRandomGradients(dmat->Info().num_row_)); + + std::vector> position(1); + RegTree tree; + hist_maker.Update(¶m, &gpair, dmat, common::Span>{position}, + {&tree}); + return tree; +} + +void VerifyColumnSplit(bst_row_t rows, bst_feature_t cols, RegTree const& expected_tree) { + Context ctx(MakeCUDACtx(GPUIDX)); + + auto Xy = 
RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true); + auto const world_size = collective::GetWorldSize(); + auto const rank = collective::GetRank(); + std::unique_ptr sliced{Xy->SliceCol(world_size, rank)}; + + RegTree tree = GetUpdatedTree(&ctx, sliced.get()); + + Json json{Object{}}; + tree.SaveModel(&json); + Json expected_json{Object{}}; + expected_tree.SaveModel(&expected_json); + ASSERT_EQ(json, expected_json); +} +} // anonymous namespace + +class MGPUHistTest : public BaseMGPUTest {}; + +TEST_F(MGPUHistTest, GPUHistColumnSplit) { + auto constexpr kRows = 32; + auto constexpr kCols = 16; + + Context ctx(MakeCUDACtx(0)); + auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true); + RegTree expected_tree = GetUpdatedTree(&ctx, dmat.get()); + + DoTest(VerifyColumnSplit, kRows, kCols, expected_tree); +} } // namespace xgboost::tree From c928dd4ff500743fdf0ffd335464ccbf360f285d Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Sat, 2 Sep 2023 20:37:11 -0700 Subject: [PATCH 131/136] Support vertical federated learning with `gpu_hist` (#9539) --- src/collective/aggregator.h | 47 ++++++++++++++++++- src/learner.cc | 6 +-- src/objective/adaptive.cu | 54 +++++++++++----------- src/tree/fit_stump.cc | 8 ++-- src/tree/fit_stump.cu | 9 ++-- tests/cpp/plugin/test_federated_learner.cc | 46 ++++++++++-------- 6 files changed, 113 insertions(+), 57 deletions(-) diff --git a/src/collective/aggregator.h b/src/collective/aggregator.h index b33ca28ef..f2a9ff528 100644 --- a/src/collective/aggregator.h +++ b/src/collective/aggregator.h @@ -26,7 +26,6 @@ namespace collective { * applied there, with the results broadcast to other workers. * * @tparam Function The function used to calculate the results. - * @tparam Args Arguments to the function. * @param info MetaInfo about the DMatrix. * @param buffer The buffer storing the results. * @param size The size of the buffer. @@ -57,6 +56,52 @@ void ApplyWithLabels(MetaInfo const& info, void* buffer, size_t size, Function&& } } +/** + * @brief Apply the given function where the labels are. + * + * Normally all the workers have access to the labels, so the function is just applied locally. In + * vertical federated learning, we assume labels are only available on worker 0, so the function is + * applied there, with the results broadcast to other workers. + * + * @tparam T Type of the HostDeviceVector storing the results. + * @tparam Function The function used to calculate the results. + * @param info MetaInfo about the DMatrix. + * @param result The HostDeviceVector storing the results. + * @param function The function used to calculate the results. + */ +template +void ApplyWithLabels(MetaInfo const& info, HostDeviceVector* result, Function&& function) { + if (info.IsVerticalFederated()) { + // We assume labels are only available on worker 0, so the calculation is done there and result + // broadcast to other workers. + std::string message; + if (collective::GetRank() == 0) { + try { + std::forward(function)(); + } catch (dmlc::Error& e) { + message = e.what(); + } + } + + collective::Broadcast(&message, 0); + if (!message.empty()) { + LOG(FATAL) << &message[0]; + return; + } + + std::size_t size{}; + if (collective::GetRank() == 0) { + size = result->Size(); + } + collective::Broadcast(&size, sizeof(std::size_t), 0); + + result->Resize(size); + collective::Broadcast(result->HostPointer(), size * sizeof(T), 0); + } else { + std::forward(function)(); + } +} + /** * @brief Find the global max of the given value across all workers. 
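The new `ApplyWithLabels` overload above captures the vertical-federated protocol in one place: worker 0, the only participant holding labels, runs the computation; its error message (possibly empty) is broadcast so that all workers fail together; then the result size and buffer are broadcast. A compact Python sketch of the same control flow, with `broadcast(obj, root)` as an assumed stand-in for the collective that returns the root's object on every worker:

    def apply_with_labels(function, *, rank, is_vertical_federated, broadcast):
        if not is_vertical_federated:
            return function()  # labels are available everywhere; run locally
        message, result = "", None
        if rank == 0:
            try:
                result = function()
            except Exception as exc:  # ship worker-0 failures to every worker
                message = str(exc)
        message = broadcast(message, root=0)
        if message:
            raise RuntimeError(message)
        # The C++ version broadcasts the size first and the raw buffer second;
        # a Python object broadcast collapses those two steps into one.
        return broadcast(result, root=0)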
* diff --git a/src/learner.cc b/src/learner.cc index 33725b612..79dca44bd 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -847,8 +847,7 @@ class LearnerConfiguration : public Learner { void InitEstimation(MetaInfo const& info, linalg::Tensor* base_score) { base_score->Reshape(1); - collective::ApplyWithLabels(info, base_score->Data()->HostPointer(), - sizeof(bst_float) * base_score->Size(), + collective::ApplyWithLabels(info, base_score->Data(), [&] { UsePtr(obj_)->InitEstimation(info, base_score); }); } }; @@ -1467,8 +1466,7 @@ class LearnerImpl : public LearnerIO { void GetGradient(HostDeviceVector const& preds, MetaInfo const& info, std::int32_t iter, linalg::Matrix* out_gpair) { out_gpair->Reshape(info.num_row_, this->learner_model_param_.OutputLength()); - collective::ApplyWithLabels(info, out_gpair->Data()->HostPointer(), - out_gpair->Size() * sizeof(GradientPair), + collective::ApplyWithLabels(info, out_gpair->Data(), [&] { obj_->GetGradient(preds, info, iter, out_gpair); }); } diff --git a/src/objective/adaptive.cu b/src/objective/adaptive.cu index 29f70a8d8..cea211622 100644 --- a/src/objective/adaptive.cu +++ b/src/objective/adaptive.cu @@ -6,6 +6,7 @@ #include // std::int32_t #include // NOLINT +#include "../collective/aggregator.h" #include "../common/cuda_context.cuh" // CUDAContext #include "../common/device_helpers.cuh" #include "../common/stats.cuh" @@ -154,38 +155,39 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span pos UpdateLeafValues(&quantiles, nidx.ConstHostVector(), info, learning_rate, p_tree); } - HostDeviceVector quantiles; predt.SetDevice(ctx->Device()); - auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), info.num_row_, predt.Size() / info.num_row_); CHECK_LT(group_idx, d_predt.Shape(1)); auto t_predt = d_predt.Slice(linalg::All(), group_idx); - auto d_labels = info.labels.View(ctx->Device()).Slice(linalg::All(), IdxY(info, group_idx)); - - auto d_row_index = dh::ToSpan(ridx); - auto seg_beg = nptr.DevicePointer(); - auto seg_end = seg_beg + nptr.Size(); - auto val_beg = dh::MakeTransformIterator(thrust::make_counting_iterator(0ul), - [=] XGBOOST_DEVICE(size_t i) { - float p = t_predt(d_row_index[i]); - auto y = d_labels(d_row_index[i]); - return y - p; - }); - CHECK_EQ(d_labels.Shape(0), position.size()); - auto val_end = val_beg + d_labels.Shape(0); - CHECK_EQ(nidx.Size() + 1, nptr.Size()); - if (info.weights_.Empty()) { - common::SegmentedQuantile(ctx, alpha, seg_beg, seg_end, val_beg, val_end, &quantiles); - } else { - info.weights_.SetDevice(ctx->Device()); - auto d_weights = info.weights_.ConstDeviceSpan(); - CHECK_EQ(d_weights.size(), d_row_index.size()); - auto w_it = thrust::make_permutation_iterator(dh::tcbegin(d_weights), dh::tcbegin(d_row_index)); - common::SegmentedWeightedQuantile(ctx, alpha, seg_beg, seg_end, val_beg, val_end, w_it, - w_it + d_weights.size(), &quantiles); - } + HostDeviceVector quantiles; + collective::ApplyWithLabels(info, &quantiles, [&] { + auto d_labels = info.labels.View(ctx->Device()).Slice(linalg::All(), IdxY(info, group_idx)); + auto d_row_index = dh::ToSpan(ridx); + auto seg_beg = nptr.DevicePointer(); + auto seg_end = seg_beg + nptr.Size(); + auto val_beg = dh::MakeTransformIterator(thrust::make_counting_iterator(0ul), + [=] XGBOOST_DEVICE(size_t i) { + float p = t_predt(d_row_index[i]); + auto y = d_labels(d_row_index[i]); + return y - p; + }); + CHECK_EQ(d_labels.Shape(0), position.size()); + auto val_end = val_beg + d_labels.Shape(0); + CHECK_EQ(nidx.Size() + 1, nptr.Size()); + if 
(info.weights_.Empty()) { + common::SegmentedQuantile(ctx, alpha, seg_beg, seg_end, val_beg, val_end, &quantiles); + } else { + info.weights_.SetDevice(ctx->Device()); + auto d_weights = info.weights_.ConstDeviceSpan(); + CHECK_EQ(d_weights.size(), d_row_index.size()); + auto w_it = + thrust::make_permutation_iterator(dh::tcbegin(d_weights), dh::tcbegin(d_row_index)); + common::SegmentedWeightedQuantile(ctx, alpha, seg_beg, seg_end, val_beg, val_end, w_it, + w_it + d_weights.size(), &quantiles); + } + }); UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), info, learning_rate, p_tree); } } // namespace detail diff --git a/src/tree/fit_stump.cc b/src/tree/fit_stump.cc index ec1b6fe18..a8f5e1d8e 100644 --- a/src/tree/fit_stump.cc +++ b/src/tree/fit_stump.cc @@ -55,11 +55,11 @@ void FitStump(Context const* ctx, MetaInfo const& info, } // namespace cpu_impl namespace cuda_impl { -void FitStump(Context const* ctx, linalg::TensorView gpair, - linalg::VectorView out); +void FitStump(Context const* ctx, MetaInfo const& info, + linalg::TensorView gpair, linalg::VectorView out); #if !defined(XGBOOST_USE_CUDA) -inline void FitStump(Context const*, linalg::TensorView, +inline void FitStump(Context const*, MetaInfo const&, linalg::TensorView, linalg::VectorView) { common::AssertGPUSupport(); } @@ -74,7 +74,7 @@ void FitStump(Context const* ctx, MetaInfo const& info, linalg::MatrixDevice()); auto gpair_t = gpair.View(ctx->Device()); ctx->IsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView()) - : cuda_impl::FitStump(ctx, gpair_t, out->View(ctx->Device())); + : cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device())); } } // namespace tree } // namespace xgboost diff --git a/src/tree/fit_stump.cu b/src/tree/fit_stump.cu index 40b2a0c96..f0d53bff1 100644 --- a/src/tree/fit_stump.cu +++ b/src/tree/fit_stump.cu @@ -11,6 +11,7 @@ #include // std::size_t +#include "../collective/aggregator.cuh" #include "../collective/communicator-inl.cuh" #include "../common/device_helpers.cuh" // dh::MakeTransformIterator #include "fit_stump.h" @@ -23,8 +24,8 @@ namespace xgboost { namespace tree { namespace cuda_impl { -void FitStump(Context const* ctx, linalg::TensorView gpair, - linalg::VectorView out) { +void FitStump(Context const* ctx, MetaInfo const& info, + linalg::TensorView gpair, linalg::VectorView out) { auto n_targets = out.Size(); CHECK_EQ(n_targets, gpair.Shape(1)); linalg::Vector sum = linalg::Constant(ctx, GradientPairPrecise{}, n_targets); @@ -49,8 +50,8 @@ void FitStump(Context const* ctx, linalg::TensorView gpai thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it, thrust::make_discard_iterator(), dh::tbegin(d_sum.Values())); - collective::AllReduce( - ctx->gpu_id, reinterpret_cast(d_sum.Values().data()), d_sum.Size() * 2); + collective::GlobalSum(info, ctx->gpu_id, reinterpret_cast(d_sum.Values().data()), + d_sum.Size() * 2); thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), n_targets, [=] XGBOOST_DEVICE(std::size_t i) mutable { diff --git a/tests/cpp/plugin/test_federated_learner.cc b/tests/cpp/plugin/test_federated_learner.cc index ac514d169..427bd790c 100644 --- a/tests/cpp/plugin/test_federated_learner.cc +++ b/tests/cpp/plugin/test_federated_learner.cc @@ -15,9 +15,11 @@ namespace xgboost { namespace { -auto MakeModel(std::string tree_method, std::string objective, std::shared_ptr dmat) { +auto MakeModel(std::string tree_method, std::string device, std::string objective, + std::shared_ptr dmat) { std::unique_ptr 
learner{Learner::Create({dmat})}; learner->SetParam("tree_method", tree_method); + learner->SetParam("device", device); learner->SetParam("objective", objective); if (objective.find("quantile") != std::string::npos) { learner->SetParam("quantile_alpha", "0.5"); @@ -35,7 +37,7 @@ auto MakeModel(std::string tree_method, std::string objective, std::shared_ptr dmat{RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(rank == 0)}; @@ -61,14 +63,14 @@ void VerifyObjective(size_t rows, size_t cols, float expected_base_score, Json e } std::shared_ptr sliced{dmat->SliceCol(world_size, rank)}; - auto model = MakeModel(tree_method, objective, sliced); + auto model = MakeModel(tree_method, device, objective, sliced); auto base_score = GetBaseScore(model); - ASSERT_EQ(base_score, expected_base_score); - ASSERT_EQ(model, expected_model); + ASSERT_EQ(base_score, expected_base_score) << " rank " << rank; + ASSERT_EQ(model, expected_model) << " rank " << rank; } } // namespace -class FederatedLearnerTest : public ::testing::TestWithParam { +class VerticalFederatedLearnerTest : public ::testing::TestWithParam { std::unique_ptr server_; static int constexpr kWorldSize{3}; @@ -76,7 +78,7 @@ class FederatedLearnerTest : public ::testing::TestWithParam { void SetUp() override { server_ = std::make_unique(kWorldSize); } void TearDown() override { server_.reset(nullptr); } - void Run(std::string tree_method, std::string objective) { + void Run(std::string tree_method, std::string device, std::string objective) { static auto constexpr kRows{16}; static auto constexpr kCols{16}; @@ -99,27 +101,35 @@ class FederatedLearnerTest : public ::testing::TestWithParam { } } - auto model = MakeModel(tree_method, objective, dmat); + auto model = MakeModel(tree_method, device, objective, dmat); auto score = GetBaseScore(model); RunWithFederatedCommunicator(kWorldSize, server_->Address(), &VerifyObjective, kRows, kCols, - score, model, tree_method, objective); + score, model, tree_method, device, objective); } }; -TEST_P(FederatedLearnerTest, Approx) { +TEST_P(VerticalFederatedLearnerTest, Approx) { std::string objective = GetParam(); - this->Run("approx", objective); + this->Run("approx", "cpu", objective); } -TEST_P(FederatedLearnerTest, Hist) { +TEST_P(VerticalFederatedLearnerTest, Hist) { std::string objective = GetParam(); - this->Run("hist", objective); + this->Run("hist", "cpu", objective); } -INSTANTIATE_TEST_SUITE_P(FederatedLearnerObjective, FederatedLearnerTest, - ::testing::ValuesIn(MakeObjNamesForTest()), - [](const ::testing::TestParamInfo &info) { - return ObjTestNameGenerator(info); - }); +#if defined(XGBOOST_USE_CUDA) +TEST_P(VerticalFederatedLearnerTest, GPUHist) { + std::string objective = GetParam(); + this->Run("hist", "cuda:0", objective); +} +#endif // defined(XGBOOST_USE_CUDA) + +INSTANTIATE_TEST_SUITE_P( + FederatedLearnerObjective, VerticalFederatedLearnerTest, + ::testing::ValuesIn(MakeObjNamesForTest()), + [](const ::testing::TestParamInfo &info) { + return ObjTestNameGenerator(info); + }); } // namespace xgboost From 98e45f7b54f0b62564f8430e4a1467d4d51adf07 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 4 Sep 2023 01:44:58 -0500 Subject: [PATCH 132/136] add HTML files to gitignore (#9541) --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 672b5bcde..4480c8102 100644 --- a/.gitignore +++ b/.gitignore @@ -146,6 +146,8 @@ __MACOSX/ model*.json # R tests +*.htm +*.html *.libsvm *.rds Rplots.pdf From 419e052314f7bb5c14b0f4b82ea57e7ee72e7e9c Mon Sep 17 
00:00:00 2001 From: Bobby Wang Date: Mon, 4 Sep 2023 15:57:16 +0800 Subject: [PATCH 133/136] [pyspark] rework transform to reuse same code (#9292) --- python-package/xgboost/spark/core.py | 250 +++++++++++++-------------- 1 file changed, 122 insertions(+), 128 deletions(-) diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index af58c994f..d6667ad89 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -64,6 +64,7 @@ from xgboost.core import Booster, _check_distributed_params from xgboost.sklearn import DEFAULT_N_ESTIMATORS, XGBModel, _can_use_qdm from xgboost.training import train as worker_train +from .._typing import ArrayLike from .data import ( _read_csr_matrix_from_unwrapped_spark_vec, alias, @@ -1117,12 +1118,86 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable): ) return features_col, feature_col_names + def _get_pred_contrib_col_name(self) -> Optional[str]: + """Return the pred_contrib_col col name""" + pred_contrib_col_name = None + if ( + self.isDefined(self.pred_contrib_col) + and self.getOrDefault(self.pred_contrib_col) != "" + ): + pred_contrib_col_name = self.getOrDefault(self.pred_contrib_col) + + return pred_contrib_col_name + + def _out_schema(self) -> Tuple[bool, str]: + """Return the bool to indicate if it's a single prediction, true is single prediction, + and the returned type of the user-defined function. The value must + be a DDL-formatted type string.""" + + if self._get_pred_contrib_col_name() is not None: + return False, f"{pred.prediction} double, {pred.pred_contrib} array" + + return True, "double" + + def _get_predict_func(self) -> Callable: + """Return the true prediction function which will be running on the executor side""" + + predict_params = self._gen_predict_params_dict() + pred_contrib_col_name = self._get_pred_contrib_col_name() + + def _predict( + model: XGBModel, X: ArrayLike, base_margin: Optional[ArrayLike] + ) -> Union[pd.DataFrame, pd.Series]: + data = {} + preds = model.predict( + X, + base_margin=base_margin, + validate_features=False, + **predict_params, + ) + data[pred.prediction] = pd.Series(preds) + + if pred_contrib_col_name is not None: + contribs = pred_contribs(model, X, base_margin) + data[pred.pred_contrib] = pd.Series(list(contribs)) + return pd.DataFrame(data=data) + + return data[pred.prediction] + + return _predict + + def _post_transform(self, dataset: DataFrame, pred_col: Column) -> DataFrame: + """Post process of transform""" + prediction_col_name = self.getOrDefault(self.predictionCol) + single_pred, _ = self._out_schema() + + if single_pred: + if prediction_col_name: + dataset = dataset.withColumn(prediction_col_name, pred_col) + else: + pred_struct_col = "_prediction_struct" + dataset = dataset.withColumn(pred_struct_col, pred_col) + + if prediction_col_name: + dataset = dataset.withColumn( + prediction_col_name, getattr(col(pred_struct_col), pred.prediction) + ) + + pred_contrib_col_name = self._get_pred_contrib_col_name() + if pred_contrib_col_name is not None: + dataset = dataset.withColumn( + pred_contrib_col_name, + array_to_vector(getattr(col(pred_struct_col), pred.pred_contrib)), + ) + + dataset = dataset.drop(pred_struct_col) + return dataset + def _transform(self, dataset: DataFrame) -> DataFrame: # pylint: disable=too-many-statements, too-many-locals # Save xgb_sklearn_model and predict_params to be local variable # to avoid the `self` object to be pickled to remote. 
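The rework above turns `_transform` into a template method: the shared base class owns the pandas-UDF plumbing, while `_out_schema`, `_get_predict_func`, and `_post_transform` become the only pieces the regressor and classifier override. Stripped of the Spark details, the shape of the design looks roughly like this (hypothetical toy classes, not the actual module):

    class BaseModel:
        def _out_schema(self):
            return True, "double"            # single column of predictions

        def _get_predict_func(self):
            def _predict(model, X, base_margin):
                return model.predict(X)      # runs on the executors
            return _predict

        def _post_transform(self, rows, preds):
            return preds                     # a regressor has nothing to unpack

        def transform(self, model, rows):    # shared driver-side plumbing
            predict = self._get_predict_func()
            preds = predict(model, rows, None)
            return self._post_transform(rows, preds)

    class ClassifierModel(BaseModel):
        def _out_schema(self):
            return False, "raw array, prediction double, probability array"

        def _post_transform(self, rows, preds):
            return [max(p) for p in preds]   # e.g. unpack the struct column

The payoff is the same as in the patch: the UDF creation, sparse-input handling, and base-margin wiring live in one place instead of being duplicated per estimator.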
xgb_sklearn_model = self._xgb_sklearn_model - predict_params = self._gen_predict_params_dict() has_base_margin = False if ( @@ -1137,18 +1212,9 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable): features_col, feature_col_names = self._get_feature_col(dataset) enable_sparse_data_optim = self.getOrDefault(self.enable_sparse_data_optim) - pred_contrib_col_name = None - if ( - self.isDefined(self.pred_contrib_col) - and self.getOrDefault(self.pred_contrib_col) != "" - ): - pred_contrib_col_name = self.getOrDefault(self.pred_contrib_col) + predict_func = self._get_predict_func() - single_pred = True - schema = "double" - if pred_contrib_col_name: - single_pred = False - schema = f"{pred.prediction} double, {pred.pred_contrib} array" + _, schema = self._out_schema() @pandas_udf(schema) # type: ignore def predict_udf(iterator: Iterator[pd.DataFrame]) -> Iterator[pd.Series]: @@ -1168,48 +1234,14 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable): else: base_margin = None - data = {} - preds = model.predict( - X, - base_margin=base_margin, - validate_features=False, - **predict_params, - ) - data[pred.prediction] = pd.Series(preds) - - if pred_contrib_col_name: - contribs = pred_contribs(model, X, base_margin) - data[pred.pred_contrib] = pd.Series(list(contribs)) - yield pd.DataFrame(data=data) - else: - yield data[pred.prediction] + yield predict_func(model, X, base_margin) if has_base_margin: pred_col = predict_udf(struct(*features_col, base_margin_col)) else: pred_col = predict_udf(struct(*features_col)) - prediction_col_name = self.getOrDefault(self.predictionCol) - - if single_pred: - dataset = dataset.withColumn(prediction_col_name, pred_col) - else: - pred_struct_col = "_prediction_struct" - dataset = dataset.withColumn(pred_struct_col, pred_col) - - dataset = dataset.withColumn( - prediction_col_name, getattr(col(pred_struct_col), pred.prediction) - ) - - if pred_contrib_col_name: - dataset = dataset.withColumn( - pred_contrib_col_name, - array_to_vector(getattr(col(pred_struct_col), pred.pred_contrib)), - ) - - dataset = dataset.drop(pred_struct_col) - - return dataset + return self._post_transform(dataset, pred_col) class _ClassificationModel( # pylint: disable=abstract-method @@ -1221,22 +1253,21 @@ class _ClassificationModel( # pylint: disable=abstract-method .. Note:: This API is experimental. """ - def _transform(self, dataset: DataFrame) -> DataFrame: - # pylint: disable=too-many-statements, too-many-locals - # Save xgb_sklearn_model and predict_params to be local variable - # to avoid the `self` object to be pickled to remote. - xgb_sklearn_model = self._xgb_sklearn_model - predict_params = self._gen_predict_params_dict() + def _out_schema(self) -> Tuple[bool, str]: + schema = ( + f"{pred.raw_prediction} array, {pred.prediction} double," + f" {pred.probability} array" + ) + if self._get_pred_contrib_col_name() is not None: + # We will force setting strict_shape to True when predicting contribs, + # So, it will also output 3-D shape result. 
+ schema = f"{schema}, {pred.pred_contrib} array>" - has_base_margin = False - if ( - self.isDefined(self.base_margin_col) - and self.getOrDefault(self.base_margin_col) != "" - ): - has_base_margin = True - base_margin_col = col(self.getOrDefault(self.base_margin_col)).alias( - alias.margin - ) + return False, schema + + def _get_predict_func(self) -> Callable: + predict_params = self._gen_predict_params_dict() + pred_contrib_col_name = self._get_pred_contrib_col_name() def transform_margin(margins: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: if margins.ndim == 1: @@ -1251,76 +1282,38 @@ class _ClassificationModel( # pylint: disable=abstract-method class_probs = softmax(raw_preds, axis=1) return raw_preds, class_probs - features_col, feature_col_names = self._get_feature_col(dataset) - enable_sparse_data_optim = self.getOrDefault(self.enable_sparse_data_optim) + def _predict( + model: XGBModel, X: ArrayLike, base_margin: Optional[np.ndarray] + ) -> Union[pd.DataFrame, pd.Series]: + margins = model.predict( + X, + base_margin=base_margin, + output_margin=True, + validate_features=False, + **predict_params, + ) + raw_preds, class_probs = transform_margin(margins) - pred_contrib_col_name = None - if ( - self.isDefined(self.pred_contrib_col) - and self.getOrDefault(self.pred_contrib_col) != "" - ): - pred_contrib_col_name = self.getOrDefault(self.pred_contrib_col) + # It seems that they use argmax of class probs, + # not of margin to get the prediction (Note: scala implementation) + preds = np.argmax(class_probs, axis=1) + result: Dict[str, pd.Series] = { + pred.raw_prediction: pd.Series(list(raw_preds)), + pred.prediction: pd.Series(preds), + pred.probability: pd.Series(list(class_probs)), + } - schema = ( - f"{pred.raw_prediction} array, {pred.prediction} double," - f" {pred.probability} array" - ) - if pred_contrib_col_name: - # We will force setting strict_shape to True when predicting contribs, - # So, it will also output 3-D shape result. 
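`transform_margin` above is the piece that lets binary and multi-class models share one code path: a 1-D margin is expanded to a two-column (-margin, margin) matrix (the elided `ndim == 1` branch presumably does exactly this, since softmax over that pair reduces to the sigmoid), after which a row-wise softmax yields class probabilities in both cases. A standalone version for reference:

    import numpy as np
    from scipy.special import softmax

    def transform_margin(margins):
        if margins.ndim == 1:            # binary task: expand to two columns
            raw_preds = np.stack([-margins, margins], axis=1)
        else:                            # multi-class: margins already 2-D
            raw_preds = margins
        return raw_preds, softmax(raw_preds, axis=1)

    raw, probs = transform_margin(np.array([-1.2, 0.3, 2.5]))
    preds = np.argmax(probs, axis=1)     # argmax of probabilities, as noted above
    print(preds)                         # [0 1 1]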
- schema = f"{schema}, {pred.pred_contrib} array>" + if pred_contrib_col_name is not None: + contribs = pred_contribs(model, X, base_margin, strict_shape=True) + result[pred.pred_contrib] = pd.Series(list(contribs.tolist())) - @pandas_udf(schema) # type: ignore - def predict_udf( - iterator: Iterator[Tuple[pd.Series, ...]] - ) -> Iterator[pd.DataFrame]: - assert xgb_sklearn_model is not None - model = xgb_sklearn_model - for data in iterator: - if enable_sparse_data_optim: - X = _read_csr_matrix_from_unwrapped_spark_vec(data) - else: - if feature_col_names is not None: - X = data[feature_col_names] # type: ignore - else: - X = stack_series(data[alias.data]) + return pd.DataFrame(data=result) - if has_base_margin: - base_margin = stack_series(data[alias.margin]) - else: - base_margin = None - - margins = model.predict( - X, - base_margin=base_margin, - output_margin=True, - validate_features=False, - **predict_params, - ) - raw_preds, class_probs = transform_margin(margins) - - # It seems that they use argmax of class probs, - # not of margin to get the prediction (Note: scala implementation) - preds = np.argmax(class_probs, axis=1) - result: Dict[str, pd.Series] = { - pred.raw_prediction: pd.Series(list(raw_preds)), - pred.prediction: pd.Series(preds), - pred.probability: pd.Series(list(class_probs)), - } - - if pred_contrib_col_name: - contribs = pred_contribs(model, X, base_margin, strict_shape=True) - result[pred.pred_contrib] = pd.Series(list(contribs.tolist())) - - yield pd.DataFrame(data=result) - - if has_base_margin: - pred_struct = predict_udf(struct(*features_col, base_margin_col)) - else: - pred_struct = predict_udf(struct(*features_col)) + return _predict + def _post_transform(self, dataset: DataFrame, pred_col: Column) -> DataFrame: pred_struct_col = "_prediction_struct" - dataset = dataset.withColumn(pred_struct_col, pred_struct) + dataset = dataset.withColumn(pred_struct_col, pred_col) raw_prediction_col_name = self.getOrDefault(self.rawPredictionCol) if raw_prediction_col_name: @@ -1342,7 +1335,8 @@ class _ClassificationModel( # pylint: disable=abstract-method array_to_vector(getattr(col(pred_struct_col), pred.probability)), ) - if pred_contrib_col_name: + pred_contrib_col_name = self._get_pred_contrib_col_name() + if pred_contrib_col_name is not None: dataset = dataset.withColumn( pred_contrib_col_name, getattr(col(pred_struct_col), pred.pred_contrib), From d159ee85474b288694b4b68048489dedcd49c7a8 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 4 Sep 2023 04:40:46 -0500 Subject: [PATCH 134/136] [R] reformat build scripts (#9540) --- R-package/CMakeLists.txt | 39 +++++++++++++++++++++++++------------- R-package/src/Makevars.in | 31 ++++++++++++++++++++++++------ R-package/src/Makevars.win | 31 ++++++++++++++++++++++++------ 3 files changed, 76 insertions(+), 25 deletions(-) diff --git a/R-package/CMakeLists.txt b/R-package/CMakeLists.txt index 003a635a5..a19e56f4e 100644 --- a/R-package/CMakeLists.txt +++ b/R-package/CMakeLists.txt @@ -1,41 +1,54 @@ find_package(LibR REQUIRED) message(STATUS "LIBR_CORE_LIBRARY " ${LIBR_CORE_LIBRARY}) -file(GLOB_RECURSE R_SOURCES +file( + GLOB_RECURSE R_SOURCES ${CMAKE_CURRENT_LIST_DIR}/src/*.cc - ${CMAKE_CURRENT_LIST_DIR}/src/*.c) + ${CMAKE_CURRENT_LIST_DIR}/src/*.c +) + # Use object library to expose symbols add_library(xgboost-r OBJECT ${R_SOURCES}) -if (ENABLE_ALL_WARNINGS) + +if(ENABLE_ALL_WARNINGS) target_compile_options(xgboost-r PRIVATE -Wall -Wextra) -endif (ENABLE_ALL_WARNINGS) -target_compile_definitions(xgboost-r - PUBLIC 
+endif() + +target_compile_definitions( + xgboost-r PUBLIC -DXGBOOST_STRICT_R_MODE=1 -DXGBOOST_CUSTOMIZE_GLOBAL_PRNG=1 -DDMLC_LOG_BEFORE_THROW=0 -DDMLC_DISABLE_STDIN=1 -DDMLC_LOG_CUSTOMIZE=1 - -DRABIT_STRICT_CXX98_) -target_include_directories(xgboost-r - PRIVATE + -DRABIT_STRICT_CXX98_ +) + +target_include_directories( + xgboost-r PRIVATE ${LIBR_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/dmlc-core/include - ${PROJECT_SOURCE_DIR}/rabit/include) + ${PROJECT_SOURCE_DIR}/rabit/include +) + target_link_libraries(xgboost-r PUBLIC ${LIBR_CORE_LIBRARY}) -if (USE_OPENMP) + +if(USE_OPENMP) find_package(OpenMP REQUIRED) target_link_libraries(xgboost-r PUBLIC OpenMP::OpenMP_CXX OpenMP::OpenMP_C) -endif (USE_OPENMP) +endif() + set_target_properties( xgboost-r PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON - POSITION_INDEPENDENT_CODE ON) + POSITION_INDEPENDENT_CODE ON +) # Get compilation and link flags of xgboost-r and propagate to objxgboost target_link_libraries(objxgboost PUBLIC xgboost-r) + # Add all objects of xgboost-r to objxgboost target_sources(objxgboost INTERFACE $) diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in index 9e7cbfed4..f42c94501 100644 --- a/R-package/src/Makevars.in +++ b/R-package/src/Makevars.in @@ -5,9 +5,12 @@ ENABLE_STD_THREAD=1 CXX_STD = CXX17 -XGB_RFLAGS = -DXGBOOST_STRICT_R_MODE=1 -DDMLC_LOG_BEFORE_THROW=0\ - -DDMLC_ENABLE_STD_THREAD=$(ENABLE_STD_THREAD) -DDMLC_DISABLE_STDIN=1\ - -DDMLC_LOG_CUSTOMIZE=1 +XGB_RFLAGS = \ + -DXGBOOST_STRICT_R_MODE=1 \ + -DDMLC_LOG_BEFORE_THROW=0 \ + -DDMLC_ENABLE_STD_THREAD=$(ENABLE_STD_THREAD) \ + -DDMLC_DISABLE_STDIN=1 \ + -DDMLC_LOG_CUSTOMIZE=1 # disable the use of thread_local for 32 bit windows: ifeq ($(R_OSTYPE)$(WIN),windows) @@ -15,9 +18,25 @@ ifeq ($(R_OSTYPE)$(WIN),windows) endif $(foreach v, $(XGB_RFLAGS), $(warning $(v))) -PKG_CPPFLAGS= -I$(PKGROOT)/include -I$(PKGROOT)/dmlc-core/include -I$(PKGROOT)/rabit/include -I$(PKGROOT) $(XGB_RFLAGS) -PKG_CXXFLAGS= @OPENMP_CXXFLAGS@ @ENDIAN_FLAG@ -pthread $(CXX_VISIBILITY) -PKG_LIBS = @OPENMP_CXXFLAGS@ @OPENMP_LIB@ @ENDIAN_FLAG@ @BACKTRACE_LIB@ -pthread +PKG_CPPFLAGS = \ + -I$(PKGROOT)/include \ + -I$(PKGROOT)/dmlc-core/include \ + -I$(PKGROOT)/rabit/include \ + -I$(PKGROOT) \ + $(XGB_RFLAGS) + +PKG_CXXFLAGS = \ + @OPENMP_CXXFLAGS@ \ + @ENDIAN_FLAG@ \ + -pthread \ + $(CXX_VISIBILITY) + +PKG_LIBS = \ + @OPENMP_CXXFLAGS@ \ + @OPENMP_LIB@ \ + @ENDIAN_FLAG@ \ + @BACKTRACE_LIB@ \ + -pthread OBJECTS= \ ./xgboost_R.o \ diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win index 7dfa415a4..1b620751f 100644 --- a/R-package/src/Makevars.win +++ b/R-package/src/Makevars.win @@ -5,9 +5,12 @@ ENABLE_STD_THREAD=0 CXX_STD = CXX17 -XGB_RFLAGS = -DXGBOOST_STRICT_R_MODE=1 -DDMLC_LOG_BEFORE_THROW=0\ - -DDMLC_ENABLE_STD_THREAD=$(ENABLE_STD_THREAD) -DDMLC_DISABLE_STDIN=1\ - -DDMLC_LOG_CUSTOMIZE=1 +XGB_RFLAGS = \ + -DXGBOOST_STRICT_R_MODE=1 \ + -DDMLC_LOG_BEFORE_THROW=0 \ + -DDMLC_ENABLE_STD_THREAD=$(ENABLE_STD_THREAD) \ + -DDMLC_DISABLE_STDIN=1 \ + -DDMLC_LOG_CUSTOMIZE=1 # disable the use of thread_local for 32 bit windows: ifeq ($(R_OSTYPE)$(WIN),windows) @@ -15,9 +18,25 @@ ifeq ($(R_OSTYPE)$(WIN),windows) endif $(foreach v, $(XGB_RFLAGS), $(warning $(v))) -PKG_CPPFLAGS= -I$(PKGROOT)/include -I$(PKGROOT)/dmlc-core/include -I$(PKGROOT)/rabit/include -I$(PKGROOT) $(XGB_RFLAGS) -PKG_CXXFLAGS= $(SHLIB_OPENMP_CXXFLAGS) -DDMLC_CMAKE_LITTLE_ENDIAN=1 $(SHLIB_PTHREAD_FLAGS) $(CXX_VISIBILITY) -PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) -DDMLC_CMAKE_LITTLE_ENDIAN=1 
$(SHLIB_PTHREAD_FLAGS) \
+	-lwsock32 \
+	-lws2_32
 
 OBJECTS= \
 	./xgboost_R.o \

From adea842c838095b92bda2dde962a28a606513e57 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Tue, 5 Sep 2023 01:04:24 +0800
Subject: [PATCH 135/136] Fix inplace predict with fallback when base margin is
 used. (#9536)

- Copy meta info from proxy DMatrix.
- Use `std::call_once` to emit fewer warnings.
---
 src/common/error_msg.cc | 57 +++++++++++++++----------
 src/common/error_msg.h | 5 ++-
 src/data/proxy_dmatrix.cc | 1 +
 src/gbm/gbtree.cc | 23 +---------
 tests/cpp/gbm/test_gbtree.cu | 15 -------
 tests/python-gpu/test_gpu_prediction.py | 24 +++++++++--
 6 files changed, 62 insertions(+), 63 deletions(-)

diff --git a/src/common/error_msg.cc b/src/common/error_msg.cc
index 062549794..8871c1a1d 100644
--- a/src/common/error_msg.cc
+++ b/src/common/error_msg.cc
@@ -3,9 +3,11 @@
  */
 #include "error_msg.h"
 
+#include <mutex>    // for call_once, once_flag
 #include <sstream>  // for stringstream
 
 #include "../collective/communicator-inl.h"  // for GetRank
+#include "xgboost/context.h"  // for Context
 #include "xgboost/logging.h"
 
 namespace xgboost::error {
@@ -26,34 +28,43 @@ void WarnDeprecatedGPUHist() {
 }
 
 void WarnManualUpdater() {
-  bool static thread_local logged{false};
-  if (logged) {
-    return;
-  }
-  LOG(WARNING)
-      << "You have manually specified the `updater` parameter. The `tree_method` parameter "
-         "will be ignored. Incorrect sequence of updaters will produce undefined "
-         "behavior. For common uses, we recommend using `tree_method` parameter instead.";
-  logged = true;
+  static std::once_flag flag;
+  std::call_once(flag, [] {
+    LOG(WARNING)
+        << "You have manually specified the `updater` parameter. The `tree_method` parameter "
+           "will be ignored. Incorrect sequence of updaters will produce undefined "
+           "behavior. For common uses, we recommend using `tree_method` parameter instead.";
+  });
 }
 
 void WarnDeprecatedGPUId() {
-  static thread_local bool logged{false};
-  if (logged) {
-    return;
-  }
-  auto msg = DeprecatedFunc("gpu_id", "2.0.0", "device");
-  msg += " E.g. device=cpu/cuda/cuda:0";
-  LOG(WARNING) << msg;
-  logged = true;
+  static std::once_flag flag;
+  std::call_once(flag, [] {
+    auto msg = DeprecatedFunc("gpu_id", "2.0.0", "device");
+    msg += " E.g. device=cpu/cuda/cuda:0";
+    LOG(WARNING) << msg;
+  });
 }
 
 void WarnEmptyDataset() {
-  static thread_local bool logged{false};
-  if (logged) {
-    return;
-  }
-  LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank();
-  logged = true;
+  static std::once_flag flag;
+  std::call_once(flag,
+                 [] { LOG(WARNING) << "Empty dataset at worker: " << collective::GetRank(); });
+}
+
+void MismatchedDevices(Context const* booster, Context const* data) {
+  static std::once_flag flag;
+  std::call_once(flag, [&] {
+    LOG(WARNING)
+        << "Falling back to prediction using DMatrix due to mismatched devices. This might "
+           "lead to higher memory usage and slower performance. XGBoost is running on: "
+        << booster->DeviceName() << ", while the input data is on: " << data->DeviceName() << ".\n"
+        << R"(Potential solutions:
+- Use a data structure that matches the device ordinal in the booster.
+- Set the device for booster before call to inplace_predict. + +This warning will only be shown once. +)"; + }); } } // namespace xgboost::error diff --git a/src/common/error_msg.h b/src/common/error_msg.h index 1af4b7c88..94703fd15 100644 --- a/src/common/error_msg.h +++ b/src/common/error_msg.h @@ -10,7 +10,8 @@ #include // for numeric_limits #include // for string -#include "xgboost/base.h" // for bst_feature_t +#include "xgboost/base.h" // for bst_feature_t +#include "xgboost/context.h" // for Context #include "xgboost/logging.h" #include "xgboost/string_view.h" // for StringView @@ -94,5 +95,7 @@ constexpr StringView InvalidCUDAOrdinal() { return "Invalid device. `device` is required to be CUDA and there must be at least one GPU " "available for using GPU."; } + +void MismatchedDevices(Context const* booster, Context const* data); } // namespace xgboost::error #endif // XGBOOST_COMMON_ERROR_MSG_H_ diff --git a/src/data/proxy_dmatrix.cc b/src/data/proxy_dmatrix.cc index cb8e290c8..e920ef50e 100644 --- a/src/data/proxy_dmatrix.cc +++ b/src/data/proxy_dmatrix.cc @@ -55,6 +55,7 @@ std::shared_ptr CreateDMatrixFromProxy(Context const *ctx, } CHECK(p_fmat) << "Failed to fallback."; + p_fmat->Info() = proxy->Info().Copy(); return p_fmat; } } // namespace xgboost::data diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 50dfe9262..438fd15e6 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -85,25 +85,6 @@ bool UpdatersMatched(std::vector updater_seq, return name == up->Name(); }); } - -void MismatchedDevices(Context const* booster, Context const* data) { - bool thread_local static logged{false}; - if (logged) { - return; - } - LOG(WARNING) << "Falling back to prediction using DMatrix due to mismatched devices. This might " - "lead to higher memory usage and slower performance. XGBoost is running on: " - << booster->DeviceName() << ", while the input data is on: " << data->DeviceName() - << ".\n" - << R"(Potential solutions: -- Use a data structure that matches the device ordinal in the booster. -- Set the device for booster before call to inplace_predict. - -This warning will only be shown once for each thread. Subsequent warnings made by the -current thread will be suppressed. 
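The switch from `thread_local` flags to `std::once_flag` changes the suppression scope from once-per-thread to once-per-process, which is what a deprecation or fallback warning usually wants. The same idea in Python, as a hypothetical decorator shown only to illustrate the semantics:

    import functools
    import threading

    def warn_once(fn):
        lock, fired = threading.Lock(), False

        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            nonlocal fired
            with lock:              # like std::call_once: the first caller wins
                if fired:
                    return
                fired = True
            return fn(*args, **kwargs)
        return wrapper

    @warn_once
    def warn_mismatched_devices(booster_dev, data_dev):
        print(f"falling back to DMatrix: booster on {booster_dev}, data on {data_dev}")

    for _ in range(3):
        warn_mismatched_devices("cuda:0", "cpu")  # prints exactly once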
-)"; - logged = true; -} } // namespace void GBTree::Configure(Args const& cfg) { @@ -557,7 +538,7 @@ void GBTree::InplacePredict(std::shared_ptr p_m, float missing, auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end); CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees."; if (p_m->Ctx()->Device() != this->ctx_->Device()) { - MismatchedDevices(this->ctx_, p_m->Ctx()); + error::MismatchedDevices(this->ctx_, p_m->Ctx()); CHECK_EQ(out_preds->version, 0); auto proxy = std::dynamic_pointer_cast(p_m); CHECK(proxy) << error::InplacePredictProxy(); @@ -810,7 +791,7 @@ class Dart : public GBTree { auto n_groups = model_.learner_model_param->num_output_group; if (ctx_->Device() != p_fmat->Ctx()->Device()) { - MismatchedDevices(ctx_, p_fmat->Ctx()); + error::MismatchedDevices(ctx_, p_fmat->Ctx()); auto proxy = std::dynamic_pointer_cast(p_fmat); CHECK(proxy) << error::InplacePredictProxy(); auto p_fmat = data::CreateDMatrixFromProxy(ctx_, proxy, missing); diff --git a/tests/cpp/gbm/test_gbtree.cu b/tests/cpp/gbm/test_gbtree.cu index 03f689822..801c935d6 100644 --- a/tests/cpp/gbm/test_gbtree.cu +++ b/tests/cpp/gbm/test_gbtree.cu @@ -58,21 +58,6 @@ void TestInplaceFallback(Context const* ctx) { HostDeviceVector* out_predt{nullptr}; ConsoleLogger::Configure(Args{{"verbosity", "1"}}); std::string output; - // test whether the warning is raised -#if !defined(_WIN32) - // Windows has issue with CUDA and thread local storage. For some reason, on Windows a - // cudaInitializationError is raised during destruction of `HostDeviceVector`. This - // might be related to https://github.com/dmlc/xgboost/issues/5793 - ::testing::internal::CaptureStderr(); - std::thread{[&] { - // Launch a new thread to ensure a warning is raised as we prevent over-verbose - // warning by using thread-local flags. - learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits::quiet_NaN(), - &out_predt, 0, 0); - }}.join(); - output = testing::internal::GetCapturedStderr(); - ASSERT_NE(output.find("Falling back"), std::string::npos); -#endif learner->InplacePredict(p_m, PredictionType::kValue, std::numeric_limits::quiet_NaN(), &out_predt, 0, 0); diff --git a/tests/python-gpu/test_gpu_prediction.py b/tests/python-gpu/test_gpu_prediction.py index fb5f47c2b..ec7c45ca2 100644 --- a/tests/python-gpu/test_gpu_prediction.py +++ b/tests/python-gpu/test_gpu_prediction.py @@ -191,14 +191,32 @@ class TestGPUPredict: np.testing.assert_allclose(predt_0, predt_3) np.testing.assert_allclose(predt_0, predt_4) - def run_inplace_base_margin(self, booster, dtrain, X, base_margin): + def run_inplace_base_margin( + self, device: int, booster: xgb.Booster, dtrain: xgb.DMatrix, X, base_margin + ) -> None: import cupy as cp + booster.set_param({"device": f"cuda:{device}"}) dtrain.set_info(base_margin=base_margin) from_inplace = booster.inplace_predict(data=X, base_margin=base_margin) from_dmatrix = booster.predict(dtrain) cp.testing.assert_allclose(from_inplace, from_dmatrix) + booster = booster.copy() # clear prediction cache. + booster.set_param({"device": "cpu"}) + from_inplace = booster.inplace_predict(data=X, base_margin=base_margin) + from_dmatrix = booster.predict(dtrain) + cp.testing.assert_allclose(from_inplace, from_dmatrix) + + booster = booster.copy() # clear prediction cache. 
+ base_margin = cp.asnumpy(base_margin) + if hasattr(X, "values"): + X = cp.asnumpy(X.values) + booster.set_param({"device": f"cuda:{device}"}) + from_inplace = booster.inplace_predict(data=X, base_margin=base_margin) + from_dmatrix = booster.predict(dtrain) + cp.testing.assert_allclose(from_inplace, from_dmatrix, rtol=1e-6) + def run_inplace_predict_cupy(self, device: int) -> None: import cupy as cp @@ -244,7 +262,7 @@ class TestGPUPredict: run_threaded_predict(X, rows, predict_dense) base_margin = cp_rng.randn(rows) - self.run_inplace_base_margin(booster, dtrain, X, base_margin) + self.run_inplace_base_margin(device, booster, dtrain, X, base_margin) # Create a wide dataset X = cp_rng.randn(100, 10000) @@ -318,7 +336,7 @@ class TestGPUPredict: run_threaded_predict(X, rows, predict_df) base_margin = cudf.Series(rng.randn(rows)) - self.run_inplace_base_margin(booster, dtrain, X, base_margin) + self.run_inplace_base_margin(0, booster, dtrain, X, base_margin) @given( strategies.integers(1, 10), tm.make_dataset_strategy(), shap_parameter_strategy From 3b9e5909fb4d8f66727c9f5ce2566b89bd9dd19f Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 5 Sep 2023 16:14:45 +0800 Subject: [PATCH 136/136] [CI] bump `setup-r` action version. (#9544) --- .github/workflows/r_tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml index 640ebce81..c353fd0d2 100644 --- a/.github/workflows/r_tests.yml +++ b/.github/workflows/r_tests.yml @@ -25,7 +25,7 @@ jobs: with: submodules: 'true' - - uses: r-lib/actions/setup-r@50d1eae9b8da0bb3f8582c59a5b82225fa2fe7f2 # v2.3.1 + - uses: r-lib/actions/setup-r@11a22a908006c25fe054c4ef0ac0436b1de3edbe # v2.6.4 with: r-version: ${{ matrix.config.r }} @@ -64,7 +64,7 @@ jobs: with: submodules: 'true' - - uses: r-lib/actions/setup-r@50d1eae9b8da0bb3f8582c59a5b82225fa2fe7f2 # v2.3.1 + - uses: r-lib/actions/setup-r@11a22a908006c25fe054c4ef0ac0436b1de3edbe # v2.6.4 with: r-version: ${{ matrix.config.r }}
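Taken together, the tests above encode the practical rule for `inplace_predict`: keep the booster's `device` and the input container on the same side to avoid the DMatrix fallback path. A minimal round trip, assuming an environment with CuPy and a CUDA build of XGBoost:

    import numpy as np
    import xgboost as xgb

    X = np.random.randn(256, 8)
    y = np.random.randn(256)
    booster = xgb.train({"tree_method": "hist"}, xgb.DMatrix(X, label=y),
                        num_boost_round=4)

    booster.set_param({"device": "cpu"})
    cpu_preds = booster.inplace_predict(X)  # numpy data, CPU booster: no fallback

    import cupy as cp
    booster.set_param({"device": "cuda:0"})
    gpu_preds = booster.inplace_predict(cp.asarray(X))  # cupy data, CUDA booster
    np.testing.assert_allclose(cpu_preds, cp.asnumpy(gpu_preds), rtol=1e-6)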