merge latest changes

This commit is contained in:
Hui Liu
2023-12-13 21:06:28 -08:00
194 changed files with 4859 additions and 2838 deletions

View File

@@ -21,6 +21,10 @@ if (USE_HIP)
target_sources(objxgboost PRIVATE ${HIP_SOURCES})
endif (USE_HIP)
if(PLUGIN_SYCL)
target_compile_definitions(objxgboost PRIVATE -DXGBOOST_USE_SYCL=1)
endif()
target_include_directories(objxgboost
PRIVATE
${xgboost_SOURCE_DIR}/include

View File

@@ -7,8 +7,6 @@
#include <cinttypes> // for strtoimax
#include <cmath> // for nan
#include <cstring> // for strcmp
#include <fstream> // for operator<<, basic_ostream, ios, stringstream
#include <functional> // for less
#include <limits> // for numeric_limits
#include <map> // for operator!=, _Rb_tree_const_iterator, _Rb_tre...
#include <memory> // for shared_ptr, allocator, __shared_ptr_access
@@ -22,7 +20,6 @@
#include "../common/charconv.h" // for from_chars, to_chars, NumericLimits, from_ch...
#include "../common/hist_util.h" // for HistogramCuts
#include "../common/io.h" // for FileExtension, LoadSequentialFile, MemoryBuf...
#include "../common/linalg_op.h" // for ElementWiseTransformHost
#include "../common/threading_utils.h" // for OmpGetNumThreads, ParallelFor
#include "../data/adapter.h" // for ArrayAdapter, DenseAdapter, RecordBatchesIte...
#include "../data/ellpack_page.h" // for EllpackPage
@@ -35,14 +32,12 @@
#include "dmlc/parameter.h" // for FieldAccessEntry, FieldEntry, ParamManager
#include "dmlc/thread_local.h" // for ThreadLocalStore
#include "rabit/c_api.h" // for RabitLinkTag
#include "rabit/rabit.h" // for CheckPoint, LoadCheckPoint
#include "xgboost/base.h" // for bst_ulong, bst_float, GradientPair, bst_feat...
#include "xgboost/context.h" // for Context
#include "xgboost/data.h" // for DMatrix, MetaInfo, DataType, ExtSparsePage
#include "xgboost/feature_map.h" // for FeatureMap
#include "xgboost/global_config.h" // for GlobalConfiguration, GlobalConfigThreadLocal...
#include "xgboost/host_device_vector.h" // for HostDeviceVector
#include "xgboost/intrusive_ptr.h" // for xgboost
#include "xgboost/json.h" // for Json, get, Integer, IsA, Boolean, String
#include "xgboost/learner.h" // for Learner, PredictionType
#include "xgboost/logging.h" // for LOG_FATAL, LogMessageFatal, CHECK, LogCheck_EQ
@@ -81,6 +76,8 @@ void XGBBuildInfoDevice(Json *p_info) {
info["USE_HIP"] = Boolean{false};
info["USE_RCCL"] = Boolean{false};
info["USE_RMM"] = Boolean{false};
info["USE_DLOPEN_NCCL"] = Boolean{false};
info["USE_DLOPEN_RCCL"] = Boolean{false};
}
} // namespace xgboost
#endif

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2019-2023 by XGBoost Contributors
* Copyright 2019-2023, XGBoost Contributors
*/
#include <thrust/transform.h> // for transform
@@ -15,6 +15,9 @@
#include "xgboost/data.h"
#include "xgboost/json.h"
#include "xgboost/learner.h"
#if defined(XGBOOST_USE_NCCL)
#include <nccl.h>
#endif
namespace xgboost {
void XGBBuildInfoDevice(Json *p_info) {
@@ -38,15 +41,27 @@ void XGBBuildInfoDevice(Json *p_info) {
info["USE_NCCL"] = Boolean{true};
v = {Json{Integer{NCCL_MAJOR}}, Json{Integer{NCCL_MINOR}}, Json{Integer{NCCL_PATCH}}};
info["NCCL_VERSION"] = v;
#if defined(XGBOOST_USE_DLOPEN_NCCL)
info["USE_DLOPEN_NCCL"] = Boolean{true};
#else
info["USE_DLOPEN_NCCL"] = Boolean{false};
#endif // defined(XGBOOST_USE_DLOPEN_NCCL)
#elif defined(XGBOOST_USE_RCCL)
info["USE_NCCL"] = Boolean{true};
info["USE_RCCL"] = Boolean{true};
v = {Json{Integer{NCCL_MAJOR}}, Json{Integer{NCCL_MINOR}}, Json{Integer{NCCL_PATCH}}};
info["RCCL_VERSION"] = v;
info["NCCL_VERSION"] = v;
#if defined(XGBOOST_USE_DLOPEN_RCCL)
info["USE_DLOPEN_RCCL"] = Boolean{true};
#else
info["USE_DLOPEN_RCCL"] = Boolean{false};
#endif // defined(XGBOOST_USE_DLOPEN_RCCL)
#else
info["USE_NCCL"] = Boolean{false};
info["USE_DLOPEN_NCCL"] = Boolean{false};
info["USE_RCCL"] = Boolean{false};
info["USE_DLOPEN_RCCL"] = Boolean{false};
#endif
#if defined(XGBOOST_USE_RMM)

119
src/c_api/coll_c_api.cc Normal file
View File

@@ -0,0 +1,119 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#include <chrono> // for seconds
#include <cstddef> // for size_t
#include <future> // for future
#include <memory> // for unique_ptr
#include <string> // for string
#include <type_traits> // for is_same_v, remove_pointer_t
#include <utility> // for pair
#include "../collective/tracker.h" // for RabitTracker
#include "c_api_error.h" // for API_BEGIN
#include "xgboost/c_api.h"
#include "xgboost/collective/result.h" // for Result
#include "xgboost/json.h" // for Json
#include "xgboost/string_view.h" // for StringView
#if defined(XGBOOST_USE_FEDERATED)
#include "../../plugin/federated/federated_tracker.h" // for FederatedTracker
#else
#include "../common/error_msg.h" // for NoFederated
#endif
using namespace xgboost; // NOLINT
namespace {
using TrackerHandleT =
std::pair<std::unique_ptr<collective::Tracker>, std::shared_future<collective::Result>>;
// Recover the internal tracker pair from the opaque C handle, validating it first.
TrackerHandleT *GetTrackerHandle(TrackerHandle handle) {
  xgboost_CHECK_C_ARG_PTR(handle);
  auto *tracker = static_cast<TrackerHandleT *>(handle);
  CHECK(tracker);
  return tracker;
}
// Per-thread scratch space for strings returned through the C API; the returned
// `char const *` stays valid until the next C API call on the same thread.
struct CollAPIEntry {
  std::string ret_str;  // backing storage for the last string handed to the caller
};
// One CollAPIEntry per thread, managed by dmlc's thread-local store.
using CollAPIThreadLocalStore = dmlc::ThreadLocalStore<CollAPIEntry>;
// Block until the tracker's background task completes, then validate its result.
void WaitImpl(TrackerHandleT *ptr) {
  std::chrono::seconds const poll_interval{100};
  auto fut = ptr->second;  // shared_future copy; cheap, shares state
  while (fut.valid()) {
    auto status = fut.wait_for(poll_interval);
    CHECK(status != std::future_status::deferred);
    if (status != std::future_status::ready) {
      continue;  // not done yet, keep waiting
    }
    auto const &rc = ptr->second.get();
    CHECK(rc.OK()) << rc.Report();
    break;
  }
}
} // namespace
// Create a tracker of the type named by `dmlc_communicator` in the JSON config
// and hand an opaque handle back to the caller.
XGB_DLL int XGTrackerCreate(char const *config, TrackerHandle *handle) {
  API_BEGIN();
  xgboost_CHECK_C_ARG_PTR(config);
  Json jconfig = Json::Load(config);
  auto type = RequiredArg<String>(jconfig, "dmlc_communicator", __func__);
  std::unique_ptr<collective::Tracker> tracker;
  if (type == "rabit") {
    tracker = std::make_unique<collective::RabitTracker>(jconfig);
  } else if (type == "federated") {
#if defined(XGBOOST_USE_FEDERATED)
    tracker = std::make_unique<collective::FederatedTracker>(jconfig);
#else
    LOG(FATAL) << error::NoFederated();
#endif  // defined(XGBOOST_USE_FEDERATED)
  } else {
    LOG(FATAL) << "Unknown communicator:" << type;
  }
  // Pair the tracker with an (initially invalid) future; XGTrackerRun fills it in.
  auto *created = new TrackerHandleT{std::move(tracker), std::future<collective::Result>{}};
  static_assert(std::is_same_v<std::remove_pointer_t<decltype(created)>, TrackerHandleT>);
  xgboost_CHECK_C_ARG_PTR(handle);
  *handle = created;
  API_END();
}
// Return (as JSON text) the arguments workers need to connect to this tracker.
// The string is owned by a thread-local buffer; valid until the next C API call.
XGB_DLL int XGTrackerWorkerArgs(TrackerHandle handle, char const **args) {
  API_BEGIN();
  auto *tracker = GetTrackerHandle(handle);
  auto &entry = *CollAPIThreadLocalStore::Get();
  entry.ret_str = Json::Dump(tracker->first->WorkerArgs());
  xgboost_CHECK_C_ARG_PTR(args);
  *args = entry.ret_str.c_str();
  API_END();
}
// Launch the tracker's background task; the resulting future is stored in the
// handle so XGTrackerWait/XGTrackerFree can join it later.
XGB_DLL int XGTrackerRun(TrackerHandle handle) {
  API_BEGIN();
  auto *tracker = GetTrackerHandle(handle);
  CHECK(!tracker->second.valid()) << "Tracker is already running.";
  tracker->second = tracker->first->Run();
  API_END();
}
/**
 * @brief Wait for the tracker's background task to finish.
 *
 * @param handle Tracker handle created by XGTrackerCreate.
 * @param config JSON-encoded options. No key is consumed yet; the string is
 *               still parsed so malformed input fails fast, and the parameter
 *               is kept for forward compatibility.
 */
XGB_DLL int XGTrackerWait(TrackerHandle handle, char const *config) {
  API_BEGIN();
  auto *ptr = GetTrackerHandle(handle);
  xgboost_CHECK_C_ARG_PTR(config);
  // Parsed only to validate the input; no option is read from it yet.
  auto jconfig = Json::Load(StringView{config});
  (void)jconfig;  // silence unused-variable warnings until options are added
  WaitImpl(ptr);
  API_END();
}
// Destroy the tracker handle, first joining its background task so the
// underlying resources are not torn down while the task is still running.
XGB_DLL int XGTrackerFree(TrackerHandle handle) {
  API_BEGIN();
  auto *tracker = GetTrackerHandle(handle);
  WaitImpl(tracker);
  delete tracker;
  API_END();
}

View File

@@ -26,18 +26,19 @@ Result RingAllgather(Comm const& comm, common::Span<std::int8_t> data, std::size
}
for (std::int32_t r = 0; r < world; ++r) {
auto send_rank = (rank + world - r + worker_off) % world;
auto send_off = send_rank * segment_size;
send_off = std::min(send_off, data.size_bytes());
auto send_seg = data.subspan(send_off, std::min(segment_size, data.size_bytes() - send_off));
next_ch->SendAll(send_seg.data(), send_seg.size_bytes());
auto recv_rank = (rank + world - r - 1 + worker_off) % world;
auto recv_off = recv_rank * segment_size;
recv_off = std::min(recv_off, data.size_bytes());
auto recv_seg = data.subspan(recv_off, std::min(segment_size, data.size_bytes() - recv_off));
prev_ch->RecvAll(recv_seg.data(), recv_seg.size_bytes());
auto rc = prev_ch->Block();
auto rc = Success() << [&] {
auto send_rank = (rank + world - r + worker_off) % world;
auto send_off = send_rank * segment_size;
send_off = std::min(send_off, data.size_bytes());
auto send_seg = data.subspan(send_off, std::min(segment_size, data.size_bytes() - send_off));
return next_ch->SendAll(send_seg.data(), send_seg.size_bytes());
} << [&] {
auto recv_rank = (rank + world - r - 1 + worker_off) % world;
auto recv_off = recv_rank * segment_size;
recv_off = std::min(recv_off, data.size_bytes());
auto recv_seg = data.subspan(recv_off, std::min(segment_size, data.size_bytes() - recv_off));
return prev_ch->RecvAll(recv_seg.data(), recv_seg.size_bytes());
} << [&] { return prev_ch->Block(); };
if (!rc.OK()) {
return rc;
}
@@ -78,19 +79,19 @@ namespace detail {
auto next_ch = comm.Chan(next);
for (std::int32_t r = 0; r < world; ++r) {
auto send_rank = (rank + world - r) % world;
auto send_off = offset[send_rank];
auto send_size = sizes[send_rank];
auto send_seg = erased_result.subspan(send_off, send_size);
next_ch->SendAll(send_seg);
auto recv_rank = (rank + world - r - 1) % world;
auto recv_off = offset[recv_rank];
auto recv_size = sizes[recv_rank];
auto recv_seg = erased_result.subspan(recv_off, recv_size);
prev_ch->RecvAll(recv_seg.data(), recv_seg.size_bytes());
auto rc = prev_ch->Block();
auto rc = Success() << [&] {
auto send_rank = (rank + world - r) % world;
auto send_off = offset[send_rank];
auto send_size = sizes[send_rank];
auto send_seg = erased_result.subspan(send_off, send_size);
return next_ch->SendAll(send_seg);
} << [&] {
auto recv_rank = (rank + world - r - 1) % world;
auto recv_off = offset[recv_rank];
auto recv_size = sizes[recv_rank];
auto recv_seg = erased_result.subspan(recv_off, recv_size);
return prev_ch->RecvAll(recv_seg.data(), recv_seg.size_bytes());
} << [&] { return prev_ch->Block(); };
if (!rc.OK()) {
return rc;
}

View File

@@ -6,6 +6,7 @@
#include <algorithm> // for min
#include <cstddef> // for size_t
#include <cstdint> // for int32_t, int8_t
#include <utility> // for move
#include <vector> // for vector
#include "../data/array_interface.h" // for Type, DispatchDType
@@ -36,7 +37,10 @@ Result RingScatterReduceTyped(Comm const& comm, common::Span<std::int8_t> data,
auto seg_nbytes = std::min(data.size_bytes() - send_off, n_bytes_in_seg);
auto send_seg = data.subspan(send_off, seg_nbytes);
next_ch->SendAll(send_seg);
auto rc = next_ch->SendAll(send_seg);
if (!rc.OK()) {
return rc;
}
// receive from ring prev
auto recv_off = ((rank + world - r - 1) % world) * n_bytes_in_seg;
@@ -46,8 +50,7 @@ Result RingScatterReduceTyped(Comm const& comm, common::Span<std::int8_t> data,
auto recv_seg = data.subspan(recv_off, seg_nbytes);
auto seg = s_buf.subspan(0, recv_seg.size());
prev_ch->RecvAll(seg);
auto rc = prev_ch->Block();
rc = std::move(rc) << [&] { return prev_ch->RecvAll(seg); } << [&] { return comm.Block(); };
if (!rc.OK()) {
return rc;
}
@@ -62,6 +65,9 @@ Result RingScatterReduceTyped(Comm const& comm, common::Span<std::int8_t> data,
Result RingAllreduce(Comm const& comm, common::Span<std::int8_t> data, Func const& op,
ArrayInterfaceHandler::Type type) {
if (comm.World() == 1) {
return Success();
}
return DispatchDType(type, [&](auto t) {
using T = decltype(t);
// Divide the data into segments according to the number of workers.
@@ -80,11 +86,9 @@ Result RingAllreduce(Comm const& comm, common::Span<std::int8_t> data, Func cons
auto prev_ch = comm.Chan(prev);
auto next_ch = comm.Chan(next);
rc = RingAllgather(comm, data, n_bytes_in_seg, 1, prev_ch, next_ch);
if (!rc.OK()) {
return rc;
}
return comm.Block();
return std::move(rc) << [&] {
return RingAllgather(comm, data, n_bytes_in_seg, 1, prev_ch, next_ch);
} << [&] { return comm.Block(); };
});
}
} // namespace xgboost::collective::cpu_impl

View File

@@ -62,8 +62,8 @@ Result Broadcast(Comm const& comm, common::Span<std::int8_t> data, std::int32_t
if (shifted_rank != 0) { // not root
auto parent = ShiftRight(ShiftedParentRank(shifted_rank, depth), world, root);
comm.Chan(parent)->RecvAll(data);
auto rc = comm.Chan(parent)->Block();
auto rc = Success() << [&] { return comm.Chan(parent)->RecvAll(data); }
<< [&] { return comm.Chan(parent)->Block(); };
if (!rc.OK()) {
return Fail("broadcast failed.", std::move(rc));
}
@@ -75,7 +75,10 @@ Result Broadcast(Comm const& comm, common::Span<std::int8_t> data, std::int32_t
auto sft_peer = shifted_rank + (1 << i);
auto peer = ShiftRight(sft_peer, world, root);
CHECK_NE(peer, root);
comm.Chan(peer)->SendAll(data);
auto rc = comm.Chan(peer)->SendAll(data);
if (!rc.OK()) {
return rc;
}
}
}

View File

@@ -23,25 +23,6 @@ Coll* Coll::MakeCUDAVar() { return new NCCLColl{}; }
NCCLColl::~NCCLColl() = default;
namespace {
Result GetNCCLResult(ncclResult_t code) {
if (code == ncclSuccess) {
return Success();
}
std::stringstream ss;
ss << "NCCL failure: " << ncclGetErrorString(code) << ".";
if (code == ncclUnhandledCudaError) {
// nccl usually preserves the last error so we can get more details.
auto err = cudaPeekAtLastError();
ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
} else if (code == ncclSystemError) {
ss << " This might be caused by a network configuration issue. Please consider specifying "
"the network interface for NCCL via environment variables listed in its reference: "
"`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
}
return Fail(ss.str());
}
auto GetNCCLType(ArrayInterfaceHandler::Type type) {
auto fatal = [] {
LOG(FATAL) << "Invalid type for NCCL operation.";
@@ -98,11 +79,12 @@ void RunBitwiseAllreduce(dh::CUDAStreamView stream, common::Span<std::int8_t> ou
common::Span<std::int8_t> data, Op op) {
dh::device_vector<std::int8_t> buffer(data.size() * pcomm->World());
auto* device_buffer = buffer.data().get();
auto stub = pcomm->Stub();
// First gather data from all the workers.
CHECK(handle);
auto rc = GetNCCLResult(
ncclAllGather(data.data(), device_buffer, data.size(), ncclInt8, handle, pcomm->Stream()));
auto rc =
stub->Allgather(data.data(), device_buffer, data.size(), ncclInt8, handle, pcomm->Stream());
if (!rc.OK()) {
return rc;
}
@@ -153,6 +135,8 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) {
}
auto nccl = dynamic_cast<NCCLComm const*>(&comm);
CHECK(nccl);
auto stub = nccl->Stub();
return Success() << [&] {
if (IsBitwiseOp(op)) {
return BitwiseAllReduce(nccl, nccl->Handle(), data, op);
@@ -160,9 +144,8 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) {
return DispatchDType(type, [=](auto t) {
using T = decltype(t);
auto rdata = common::RestoreType<T>(data);
auto rc = ncclAllReduce(data.data(), data.data(), rdata.size(), GetNCCLType(type),
GetNCCLRedOp(op), nccl->Handle(), nccl->Stream());
return GetNCCLResult(rc);
return stub->Allreduce(data.data(), data.data(), rdata.size(), GetNCCLType(type),
GetNCCLRedOp(op), nccl->Handle(), nccl->Stream());
});
}
} << [&] { return nccl->Block(); };
@@ -175,9 +158,11 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) {
}
auto nccl = dynamic_cast<NCCLComm const*>(&comm);
CHECK(nccl);
auto stub = nccl->Stub();
return Success() << [&] {
return GetNCCLResult(ncclBroadcast(data.data(), data.data(), data.size_bytes(), ncclInt8, root,
nccl->Handle(), nccl->Stream()));
return stub->Broadcast(data.data(), data.data(), data.size_bytes(), ncclInt8, root,
nccl->Handle(), nccl->Stream());
} << [&] { return nccl->Block(); };
}
@@ -188,10 +173,12 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) {
}
auto nccl = dynamic_cast<NCCLComm const*>(&comm);
CHECK(nccl);
auto stub = nccl->Stub();
auto send = data.subspan(comm.Rank() * size, size);
return Success() << [&] {
return GetNCCLResult(
ncclAllGather(send.data(), data.data(), size, ncclInt8, nccl->Handle(), nccl->Stream()));
return stub->Allgather(send.data(), data.data(), size, ncclInt8, nccl->Handle(),
nccl->Stream());
} << [&] { return nccl->Block(); };
}
@@ -203,19 +190,20 @@ namespace cuda_impl {
*/
Result BroadcastAllgatherV(NCCLComm const* comm, common::Span<std::int8_t const> data,
common::Span<std::int64_t const> sizes, common::Span<std::int8_t> recv) {
return Success() << [] { return GetNCCLResult(ncclGroupStart()); } << [&] {
auto stub = comm->Stub();
return Success() << [&stub] { return stub->GroupStart(); } << [&] {
std::size_t offset = 0;
for (std::int32_t r = 0; r < comm->World(); ++r) {
auto as_bytes = sizes[r];
auto rc = ncclBroadcast(data.data(), recv.subspan(offset, as_bytes).data(), as_bytes,
ncclInt8, r, comm->Handle(), dh::DefaultStream());
if (rc != ncclSuccess) {
return GetNCCLResult(rc);
auto rc = stub->Broadcast(data.data(), recv.subspan(offset, as_bytes).data(), as_bytes,
ncclInt8, r, comm->Handle(), dh::DefaultStream());
if (!rc.OK()) {
return rc;
}
offset += as_bytes;
}
return Success();
} << [] { return GetNCCLResult(ncclGroupEnd()); };
} << [&] { return stub->GroupEnd(); };
}
} // namespace cuda_impl
@@ -228,10 +216,11 @@ Result BroadcastAllgatherV(NCCLComm const* comm, common::Span<std::int8_t const>
if (!comm.IsDistributed()) {
return Success();
}
auto stub = nccl->Stub();
switch (algo) {
case AllgatherVAlgo::kRing: {
return Success() << [] { return GetNCCLResult(ncclGroupStart()); } << [&] {
return Success() << [&] { return stub->GroupStart(); } << [&] {
// get worker offset
detail::AllgatherVOffset(sizes, recv_segments);
// copy data
@@ -241,8 +230,8 @@ Result BroadcastAllgatherV(NCCLComm const* comm, common::Span<std::int8_t const>
cudaMemcpyDeviceToDevice, nccl->Stream()));
}
return detail::RingAllgatherV(comm, sizes, recv_segments, recv);
} << [] {
return GetNCCLResult(ncclGroupEnd());
} << [&] {
return stub->GroupEnd();
} << [&] { return nccl->Block(); };
}
case AllgatherVAlgo::kBcast: {

View File

@@ -8,7 +8,8 @@
#include "../data/array_interface.h" // for ArrayInterfaceHandler
#include "coll.h" // for Coll
#include "comm.h" // for Comm
#include "xgboost/span.h" // for Span
#include "nccl_stub.h"
#include "xgboost/span.h" // for Span
namespace xgboost::collective {
class NCCLColl : public Coll {

View File

@@ -5,6 +5,7 @@
#include <algorithm> // for copy
#include <chrono> // for seconds
#include <cstdlib> // for exit
#include <memory> // for shared_ptr
#include <string> // for string
#include <utility> // for move, forward
@@ -29,19 +30,28 @@ Comm::Comm(std::string const& host, std::int32_t port, std::chrono::seconds time
Result ConnectTrackerImpl(proto::PeerInfo info, std::chrono::seconds timeout, std::int32_t retry,
std::string const& task_id, TCPSocket* out, std::int32_t rank,
std::int32_t world) {
// get information from tracker
// Get information from the tracker
CHECK(!info.host.empty());
auto rc = Connect(info.host, info.port, retry, timeout, out);
if (!rc.OK()) {
return Fail("Failed to connect to the tracker.", std::move(rc));
}
TCPSocket& tracker = *out;
return std::move(rc)
<< [&] { return tracker.NonBlocking(false); }
<< [&] { return tracker.RecvTimeout(timeout); }
<< [&] { return proto::Magic{}.Verify(&tracker); }
<< [&] { return proto::Connect{}.WorkerSend(&tracker, world, rank, task_id); };
return Success() << [&] {
auto rc = Connect(info.host, info.port, retry, timeout, out);
if (rc.OK()) {
return rc;
} else {
return Fail("Failed to connect to the tracker.", std::move(rc));
}
} << [&] {
return tracker.NonBlocking(false);
} << [&] {
return tracker.RecvTimeout(timeout);
} << [&] {
return proto::Magic{}.Verify(&tracker);
} << [&] {
return proto::Connect{}.WorkerSend(&tracker, world, rank, task_id);
} << [&] {
LOG(INFO) << "Task " << task_id << " connected to the tracker";
return Success();
};
}
[[nodiscard]] Result Comm::ConnectTracker(TCPSocket* out) const {
@@ -49,14 +59,6 @@ Result ConnectTrackerImpl(proto::PeerInfo info, std::chrono::seconds timeout, st
this->Rank(), this->World());
}
#if !defined(XGBOOST_USE_NCCL) && !defined(XGBOOST_USE_RCCL)
Comm* Comm::MakeCUDAVar(Context const*, std::shared_ptr<Coll>) const {
common::AssertGPUSupport();
common::AssertNCCLSupport();
return nullptr;
}
#endif // !defined(XGBOOST_USE_NCCL)
[[nodiscard]] Result ConnectWorkers(Comm const& comm, TCPSocket* listener, std::int32_t lport,
proto::PeerInfo ninfo, std::chrono::seconds timeout,
std::int32_t retry,
@@ -181,12 +183,21 @@ Comm* Comm::MakeCUDAVar(Context const*, std::shared_ptr<Coll>) const {
}
RabitComm::RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout,
std::int32_t retry, std::string task_id)
: Comm{std::move(host), port, timeout, retry, std::move(task_id)} {
std::int32_t retry, std::string task_id, StringView nccl_path)
: HostComm{std::move(host), port, timeout, retry, std::move(task_id)},
nccl_path_{std::move(nccl_path)} {
auto rc = this->Bootstrap(timeout_, retry_, task_id_);
CHECK(rc.OK()) << rc.Report();
}
#if !defined(XGBOOST_USE_NCCL) && !defined(XGBOOST_USE_RCCL)
Comm* RabitComm::MakeCUDAVar(Context const*, std::shared_ptr<Coll>) const {
common::AssertGPUSupport();
common::AssertNCCLSupport();
return nullptr;
}
#endif // !defined(XGBOOST_USE_NCCL)
[[nodiscard]] Result RabitComm::Bootstrap(std::chrono::seconds timeout, std::int32_t retry,
std::string task_id) {
TCPSocket tracker;
@@ -209,24 +220,18 @@ RabitComm::RabitComm(std::string const& host, std::int32_t port, std::chrono::se
std::shared_ptr<TCPSocket> error_sock{TCPSocket::CreatePtr(domain)};
auto eport = error_sock->BindHost();
error_sock->Listen();
error_worker_ = std::thread{[this, error_sock = std::move(error_sock)] {
error_worker_ = std::thread{[error_sock = std::move(error_sock)] {
auto conn = error_sock->Accept();
// On Windows accept returns an invalid socket after network is shutdown.
// On Windows, accept returns a closed socket after finalize.
if (conn.IsClosed()) {
return;
}
LOG(WARNING) << "Another worker is running into error.";
std::string scmd;
conn.Recv(&scmd);
auto jcmd = Json::Load(scmd);
auto rc = this->Shutdown();
if (!rc.OK()) {
LOG(WARNING) << "Fail to shutdown worker:" << rc.Report();
}
#if !defined(XGBOOST_STRICT_R_MODE) || XGBOOST_STRICT_R_MODE == 0
exit(-1);
// exit is nicer than abort as the former performs cleanups.
std::exit(-1);
#else
LOG(FATAL) << rc.Report();
LOG(FATAL) << "abort";
#endif
}};
error_worker_.detach();
@@ -259,8 +264,8 @@ RabitComm::RabitComm(std::string const& host, std::int32_t port, std::chrono::se
CHECK(this->channels_.empty());
for (auto& w : workers) {
if (w) {
w->SetNoDelay();
rc = w->NonBlocking(true);
rc = std::move(rc) << [&] { return w->SetNoDelay(); } << [&] { return w->NonBlocking(true); }
<< [&] { return w->SetKeepAlive(); };
}
if (!rc.OK()) {
return rc;

View File

@@ -10,21 +10,24 @@
#include <sstream> // for stringstream
#include <vector> // for vector
#include "../common/cuda_context.cuh" // for CUDAContext
#include "../common/device_helpers.cuh" // for DefaultStream
#include "../common/type.h" // for EraseType
#include "broadcast.h" // for Broadcast
#include "comm.cuh" // for NCCLComm
#include "comm.h" // for Comm
#include "nccl_stub.h" // for NcclStub
#include "xgboost/collective/result.h" // for Result
#include "xgboost/span.h" // for Span
namespace xgboost::collective {
namespace {
Result GetUniqueId(Comm const& comm, std::shared_ptr<Coll> coll, ncclUniqueId* pid) {
Result GetUniqueId(Comm const& comm, std::shared_ptr<NcclStub> stub, std::shared_ptr<Coll> coll,
ncclUniqueId* pid) {
static const int kRootRank = 0;
ncclUniqueId id;
if (comm.Rank() == kRootRank) {
dh::safe_nccl(ncclGetUniqueId(&id));
auto rc = stub->GetUniqueId(&id);
CHECK(rc.OK()) << rc.Report();
}
auto rc = coll->Broadcast(
comm, common::Span{reinterpret_cast<std::int8_t*>(&id), sizeof(ncclUniqueId)}, kRootRank);
@@ -63,14 +66,15 @@ static std::string PrintUUID(xgboost::common::Span<std::uint64_t, kUuidLength> c
}
} // namespace
Comm* Comm::MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const {
return new NCCLComm{ctx, *this, pimpl};
Comm* RabitComm::MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const {
return new NCCLComm{ctx, *this, pimpl, StringView{this->nccl_path_}};
}
NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> pimpl)
NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> pimpl,
StringView nccl_path)
: Comm{root.TrackerInfo().host, root.TrackerInfo().port, root.Timeout(), root.Retry(),
root.TaskID()},
stream_{dh::DefaultStream()} {
stream_{ctx->CUDACtx()->Stream()} {
this->world_ = root.World();
this->rank_ = root.Rank();
this->domain_ = root.Domain();
@@ -79,6 +83,7 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> p
}
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
stub_ = std::make_shared<NcclStub>(nccl_path);
std::vector<std::uint64_t> uuids(root.World() * kUuidLength, 0);
auto s_uuid = xgboost::common::Span<std::uint64_t>{uuids.data(), uuids.size()};
@@ -104,19 +109,22 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> p
<< "Multiple processes within communication group running on same CUDA "
<< "device is not supported. " << PrintUUID(s_this_uuid) << "\n";
rc = GetUniqueId(root, pimpl, &nccl_unique_id_);
rc = std::move(rc) << [&] { return GetUniqueId(root, this->stub_, pimpl, &nccl_unique_id_); } <<
[&] {
return this->stub_->CommInitRank(&nccl_comm_, root.World(), nccl_unique_id_, root.Rank());
};
CHECK(rc.OK()) << rc.Report();
dh::safe_nccl(ncclCommInitRank(&nccl_comm_, root.World(), nccl_unique_id_, root.Rank()));
for (std::int32_t r = 0; r < root.World(); ++r) {
this->channels_.emplace_back(
std::make_shared<NCCLChannel>(root, r, nccl_comm_, dh::DefaultStream()));
std::make_shared<NCCLChannel>(root, r, nccl_comm_, stub_, dh::DefaultStream()));
}
}
NCCLComm::~NCCLComm() {
if (nccl_comm_) {
dh::safe_nccl(ncclCommDestroy(nccl_comm_));
auto rc = stub_->CommDestroy(nccl_comm_);
CHECK(rc.OK()) << rc.Report();
}
}
} // namespace xgboost::collective

View File

@@ -9,9 +9,13 @@
#include "../common/cuda_to_hip.h"
#include "rccl.h"
#endif // XGBOOST_USE_NCCL
#include <utility> // for move
#include "../common/device_helpers.cuh"
#include "coll.h"
#include "comm.h"
#include "nccl_stub.h" // for NcclStub
#include "xgboost/context.h"
namespace xgboost::collective {
@@ -24,15 +28,20 @@ inline Result GetCUDAResult(cudaError rc) {
return Fail(msg);
}
#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
class NCCLComm : public Comm {
ncclComm_t nccl_comm_{nullptr};
std::shared_ptr<NcclStub> stub_;
ncclUniqueId nccl_unique_id_{};
dh::CUDAStreamView stream_;
std::string nccl_path_;
public:
[[nodiscard]] ncclComm_t Handle() const { return nccl_comm_; }
auto Stub() const { return stub_; }
explicit NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> pimpl);
explicit NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> pimpl,
StringView nccl_path);
[[nodiscard]] Result LogTracker(std::string) const override {
LOG(FATAL) << "Device comm is used for logging.";
return Fail("Undefined.");
@@ -49,22 +58,29 @@ class NCCLComm : public Comm {
class NCCLChannel : public Channel {
std::int32_t rank_{-1};
ncclComm_t nccl_comm_{};
std::shared_ptr<NcclStub> stub_;
dh::CUDAStreamView stream_;
public:
explicit NCCLChannel(Comm const& comm, std::int32_t rank, ncclComm_t nccl_comm,
dh::CUDAStreamView stream)
: rank_{rank}, nccl_comm_{nccl_comm}, Channel{comm, nullptr}, stream_{stream} {}
std::shared_ptr<NcclStub> stub, dh::CUDAStreamView stream)
: rank_{rank},
nccl_comm_{nccl_comm},
stub_{std::move(stub)},
Channel{comm, nullptr},
stream_{stream} {}
void SendAll(std::int8_t const* ptr, std::size_t n) override {
dh::safe_nccl(ncclSend(ptr, n, ncclInt8, rank_, nccl_comm_, stream_));
[[nodiscard]] Result SendAll(std::int8_t const* ptr, std::size_t n) override {
return stub_->Send(ptr, n, ncclInt8, rank_, nccl_comm_, stream_);
}
void RecvAll(std::int8_t* ptr, std::size_t n) override {
dh::safe_nccl(ncclRecv(ptr, n, ncclInt8, rank_, nccl_comm_, stream_));
[[nodiscard]] Result RecvAll(std::int8_t* ptr, std::size_t n) override {
return stub_->Recv(ptr, n, ncclInt8, rank_, nccl_comm_, stream_);
}
[[nodiscard]] Result Block() override {
auto rc = stream_.Sync(false);
return GetCUDAResult(rc);
}
};
#endif // defined(XGBOOST_USE_NCCL)
} // namespace xgboost::collective

View File

@@ -34,6 +34,8 @@ inline std::int32_t BootstrapPrev(std::int32_t r, std::int32_t world) {
return nrank;
}
inline StringView DefaultNcclName() { return "libnccl.so.2"; }
class Channel;
class Coll;
@@ -86,11 +88,21 @@ class Comm : public std::enable_shared_from_this<Comm> {
[[nodiscard]] virtual Result LogTracker(std::string msg) const = 0;
[[nodiscard]] virtual Result SignalError(Result const&) { return Success(); }
virtual Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const;
};
class RabitComm : public Comm {
/**
* @brief Base class for CPU-based communicator.
*/
class HostComm : public Comm {
public:
using Comm::Comm;
[[nodiscard]] virtual Comm* MakeCUDAVar(Context const* ctx,
std::shared_ptr<Coll> pimpl) const = 0;
};
class RabitComm : public HostComm {
std::string nccl_path_ = std::string{DefaultNcclName()};
[[nodiscard]] Result Bootstrap(std::chrono::seconds timeout, std::int32_t retry,
std::string task_id);
[[nodiscard]] Result Shutdown();
@@ -100,13 +112,15 @@ class RabitComm : public Comm {
RabitComm() = default;
// ctor for testing where environment is known.
RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout,
std::int32_t retry, std::string task_id);
std::int32_t retry, std::string task_id, StringView nccl_path);
~RabitComm() noexcept(false) override;
[[nodiscard]] bool IsFederated() const override { return false; }
[[nodiscard]] Result LogTracker(std::string msg) const override;
[[nodiscard]] Result SignalError(Result const&) override;
[[nodiscard]] Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const override;
};
/**
@@ -121,21 +135,25 @@ class Channel {
explicit Channel(Comm const& comm, std::shared_ptr<TCPSocket> sock)
: sock_{std::move(sock)}, comm_{comm} {}
virtual void SendAll(std::int8_t const* ptr, std::size_t n) {
[[nodiscard]] virtual Result SendAll(std::int8_t const* ptr, std::size_t n) {
Loop::Op op{Loop::Op::kWrite, comm_.Rank(), const_cast<std::int8_t*>(ptr), n, sock_.get(), 0};
CHECK(sock_.get());
comm_.Submit(std::move(op));
return Success();
}
void SendAll(common::Span<std::int8_t const> data) {
this->SendAll(data.data(), data.size_bytes());
[[nodiscard]] Result SendAll(common::Span<std::int8_t const> data) {
return this->SendAll(data.data(), data.size_bytes());
}
virtual void RecvAll(std::int8_t* ptr, std::size_t n) {
[[nodiscard]] virtual Result RecvAll(std::int8_t* ptr, std::size_t n) {
Loop::Op op{Loop::Op::kRead, comm_.Rank(), ptr, n, sock_.get(), 0};
CHECK(sock_.get());
comm_.Submit(std::move(op));
return Success();
}
[[nodiscard]] Result RecvAll(common::Span<std::int8_t> data) {
return this->RecvAll(data.data(), data.size_bytes());
}
void RecvAll(common::Span<std::int8_t> data) { this->RecvAll(data.data(), data.size_bytes()); }
[[nodiscard]] auto Socket() const { return sock_; }
[[nodiscard]] virtual Result Block() { return comm_.Block(); }

View File

@@ -0,0 +1,122 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#include "comm_group.h"
#include <algorithm> // for transform
#include <chrono> // for seconds
#include <cstdint> // for int32_t
#include <memory> // for shared_ptr, unique_ptr
#include <string> // for string
#include <vector> // for vector
#include "../common/json_utils.h" // for OptionalArg
#include "coll.h" // for Coll
#include "comm.h" // for Comm
#include "tracker.h" // for GetHostAddress
#include "xgboost/collective/result.h" // for Result
#include "xgboost/context.h" // for DeviceOrd
#include "xgboost/json.h" // for Json
#if defined(XGBOOST_USE_FEDERATED)
#include "../../plugin/federated/federated_coll.h"
#include "../../plugin/federated/federated_comm.h"
#endif
namespace xgboost::collective {
[[nodiscard]] std::shared_ptr<Coll> CommGroup::Backend(DeviceOrd device) const {
if (device.IsCUDA()) {
if (!gpu_coll_) {
gpu_coll_.reset(backend_->MakeCUDAVar());
}
return gpu_coll_;
}
return backend_;
}
[[nodiscard]] Comm const& CommGroup::Ctx(Context const* ctx, DeviceOrd device) const {
if (device.IsCUDA()) {
CHECK(ctx->IsCUDA());
if (!gpu_comm_ || gpu_comm_->World() != comm_->World()) {
gpu_comm_.reset(comm_->MakeCUDAVar(ctx, backend_));
}
return *gpu_comm_;
}
return *comm_;
}
CommGroup::CommGroup()
: comm_{std::shared_ptr<RabitComm>(new RabitComm{})}, // NOLINT
backend_{std::shared_ptr<Coll>(new Coll{})} {} // NOLINT
[[nodiscard]] CommGroup* CommGroup::Create(Json config) {
if (IsA<Null>(config)) {
return new CommGroup;
}
std::string type = OptionalArg<String>(config, "dmlc_communicator", std::string{"rabit"});
// Try both lower and upper case for compatibility
auto get_param = [&](std::string name, auto dft, auto t) {
std::string upper;
std::transform(name.cbegin(), name.cend(), std::back_inserter(upper),
[](char c) { return std::toupper(c); });
std::transform(name.cbegin(), name.cend(), name.begin(),
[](char c) { return std::tolower(c); });
auto const& obj = get<Object const>(config);
auto it = obj.find(upper);
if (it != obj.cend()) {
return OptionalArg<decltype(t)>(config, upper, dft);
} else {
return OptionalArg<decltype(t)>(config, name, dft);
}
};
// Common args
auto retry = get_param("dmlc_retry", static_cast<Integer::Int>(DefaultRetry()), Integer{});
auto timeout =
get_param("dmlc_timeout_sec", static_cast<Integer::Int>(DefaultTimeoutSec()), Integer{});
auto task_id = get_param("dmlc_task_id", std::string{}, String{});
if (type == "rabit") {
auto host = get_param("dmlc_tracker_uri", std::string{}, String{});
auto port = get_param("dmlc_tracker_port", static_cast<std::int64_t>(0), Integer{});
auto nccl = get_param("dmlc_nccl_path", std::string{DefaultNcclName()}, String{});
auto ptr =
new CommGroup{std::shared_ptr<RabitComm>{new RabitComm{ // NOLINT
host, static_cast<std::int32_t>(port), std::chrono::seconds{timeout},
static_cast<std::int32_t>(retry), task_id, nccl}},
std::shared_ptr<Coll>(new Coll{})}; // NOLINT
return ptr;
} else if (type == "federated") {
#if defined(XGBOOST_USE_FEDERATED)
auto ptr = new CommGroup{
std::make_shared<FederatedComm>(retry, std::chrono::seconds{timeout}, task_id, config),
std::make_shared<FederatedColl>()};
return ptr;
#endif // defined(XGBOOST_USE_FEDERATED)
} else {
LOG(FATAL) << "Invalid communicator type";
}
return nullptr;
}
std::unique_ptr<collective::CommGroup>& GlobalCommGroup() {
static thread_local std::unique_ptr<collective::CommGroup> sptr;
if (!sptr) {
Json config{Null{}};
sptr.reset(CommGroup::Create(config));
}
return sptr;
}
void GlobalCommGroupInit(Json config) {
auto& sptr = GlobalCommGroup();
sptr.reset(CommGroup::Create(std::move(config)));
}
void GlobalCommGroupFinalize() {
auto& sptr = GlobalCommGroup();
sptr.reset();
}
} // namespace xgboost::collective

View File

@@ -0,0 +1,55 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#pragma once
#include <memory> // for shared_ptr, unique_ptr
#include <string> // for string
#include <utility> // for move
#include "coll.h" // for Comm
#include "comm.h" // for Coll
#include "xgboost/collective/result.h" // for Result
#include "xgboost/collective/socket.h" // for GetHostName
namespace xgboost::collective {
/**
* @brief Communicator group used for double dispatching between communicators and
* collective implementations.
*/
class CommGroup {
std::shared_ptr<HostComm> comm_;
mutable std::shared_ptr<Comm> gpu_comm_;
std::shared_ptr<Coll> backend_;
mutable std::shared_ptr<Coll> gpu_coll_; // lazy initialization
CommGroup(std::shared_ptr<Comm> comm, std::shared_ptr<Coll> coll)
: comm_{std::dynamic_pointer_cast<HostComm>(comm)}, backend_{std::move(coll)} {
CHECK(comm_);
}
public:
CommGroup();
[[nodiscard]] auto World() const { return comm_->World(); }
[[nodiscard]] auto Rank() const { return comm_->Rank(); }
[[nodiscard]] bool IsDistributed() const { return comm_->IsDistributed(); }
[[nodiscard]] static CommGroup* Create(Json config);
[[nodiscard]] std::shared_ptr<Coll> Backend(DeviceOrd device) const;
[[nodiscard]] Comm const& Ctx(Context const* ctx, DeviceOrd device) const;
[[nodiscard]] Result SignalError(Result const& res) { return comm_->SignalError(res); }
[[nodiscard]] Result ProcessorName(std::string* out) const {
auto rc = GetHostName(out);
return rc;
}
};
std::unique_ptr<collective::CommGroup>& GlobalCommGroup();
void GlobalCommGroupInit(Json config);
void GlobalCommGroupFinalize();
} // namespace xgboost::collective

View File

@@ -3,6 +3,7 @@
*/
#include "communicator.h"
#include "comm.h"
#include "in_memory_communicator.h"
#include "noop_communicator.h"
#include "rabit_communicator.h"
@@ -14,8 +15,12 @@
namespace xgboost::collective {
thread_local std::unique_ptr<Communicator> Communicator::communicator_{new NoOpCommunicator()};
thread_local CommunicatorType Communicator::type_{};
thread_local std::string Communicator::nccl_path_{};
void Communicator::Init(Json const& config) {
auto nccl = OptionalArg<String>(config, "dmlc_nccl_path", std::string{DefaultNcclName()});
nccl_path_ = nccl;
auto type = GetTypeFromEnv();
auto const arg = GetTypeFromConfig(config);
if (arg != CommunicatorType::kUnknown) {

View File

@@ -31,17 +31,17 @@ DeviceCommunicator* Communicator::GetDevice(int device_ordinal) {
#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
switch (type_) {
case CommunicatorType::kRabit:
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false, nccl_path_));
break;
case CommunicatorType::kFederated:
case CommunicatorType::kInMemory:
device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal));
break;
case CommunicatorType::kInMemoryNccl:
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, true));
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, true, nccl_path_));
break;
default:
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false, nccl_path_));
}
#else
device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal));

View File

@@ -234,6 +234,7 @@ class Communicator {
static thread_local std::unique_ptr<Communicator> communicator_;
static thread_local CommunicatorType type_;
static thread_local std::string nccl_path_;
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
static thread_local std::unique_ptr<DeviceCommunicator> device_communicator_;
#endif

View File

@@ -10,21 +10,26 @@
#include "xgboost/logging.h" // for CHECK
namespace xgboost::collective {
Result Loop::EmptyQueue() {
Result Loop::EmptyQueue(std::queue<Op>* p_queue) const {
timer_.Start(__func__);
auto error = [this] {
this->stop_ = true;
auto error = [this] { timer_.Stop(__func__); };
if (stop_) {
timer_.Stop(__func__);
};
return Success();
}
while (!queue_.empty() && !stop_) {
std::queue<Op> qcopy;
auto& qcopy = *p_queue;
// clear the copied queue
while (!qcopy.empty()) {
rabit::utils::PollHelper poll;
std::size_t n_ops = qcopy.size();
// watch all ops
while (!queue_.empty()) {
auto op = queue_.front();
queue_.pop();
// Iterate through all the ops for poll
for (std::size_t i = 0; i < n_ops; ++i) {
auto op = qcopy.front();
qcopy.pop();
switch (op.code) {
case Op::kRead: {
@@ -40,6 +45,7 @@ Result Loop::EmptyQueue() {
return Fail("Invalid socket operation.");
}
}
qcopy.push(op);
}
@@ -51,10 +57,12 @@ Result Loop::EmptyQueue() {
error();
return rc;
}
// we wonldn't be here if the queue is empty.
CHECK(!qcopy.empty());
while (!qcopy.empty() && !stop_) {
// Iterate through all the ops for performing the operations
for (std::size_t i = 0; i < n_ops; ++i) {
auto op = qcopy.front();
qcopy.pop();
@@ -81,20 +89,21 @@ Result Loop::EmptyQueue() {
}
if (n_bytes_done == -1 && !system::LastErrorWouldBlock()) {
stop_ = true;
auto rc = system::FailWithCode("Invalid socket output.");
error();
return rc;
}
op.off += n_bytes_done;
CHECK_LE(op.off, op.n);
if (op.off != op.n) {
// not yet finished, push back to queue for next round.
queue_.push(op);
qcopy.push(op);
}
}
}
timer_.Stop(__func__);
return Success();
}
@@ -107,22 +116,46 @@ void Loop::Process() {
if (stop_) {
break;
}
CHECK(!mu_.try_lock());
this->rc_ = this->EmptyQueue();
if (!rc_.OK()) {
stop_ = true;
auto unlock_notify = [&](bool is_blocking, bool stop) {
if (!is_blocking) {
std::lock_guard guard{mu_};
stop_ = stop;
} else {
stop_ = stop;
lock.unlock();
}
cv_.notify_one();
break;
};
// move the queue
std::queue<Op> qcopy;
bool is_blocking = false;
while (!queue_.empty()) {
auto op = queue_.front();
queue_.pop();
if (op.code == Op::kBlock) {
is_blocking = true;
} else {
qcopy.push(op);
}
}
// unblock the queue
if (!is_blocking) {
lock.unlock();
}
// clear the queue
auto rc = this->EmptyQueue(&qcopy);
// Handle error
if (!rc.OK()) {
unlock_notify(is_blocking, true);
std::lock_guard<std::mutex> guard{rc_lock_};
this->rc_ = std::move(rc);
return;
}
CHECK(queue_.empty());
CHECK(!mu_.try_lock());
cv_.notify_one();
}
if (rc_.OK()) {
CHECK(queue_.empty());
CHECK(qcopy.empty());
unlock_notify(is_blocking, false);
}
}
@@ -140,6 +173,24 @@ Result Loop::Stop() {
return Success();
}
[[nodiscard]] Result Loop::Block() {
{
std::lock_guard<std::mutex> guard{rc_lock_};
if (!rc_.OK()) {
return std::move(rc_);
}
}
this->Submit(Op{Op::kBlock});
{
std::unique_lock lock{mu_};
cv_.wait(lock, [this] { return (this->queue_.empty()) || stop_; });
}
{
std::lock_guard<std::mutex> lock{rc_lock_};
return std::move(rc_);
}
}
Loop::Loop(std::chrono::seconds timeout) : timeout_{timeout} {
timer_.Init(__func__);
worker_ = std::thread{[this] {

View File

@@ -20,13 +20,14 @@ namespace xgboost::collective {
class Loop {
public:
struct Op {
enum Code : std::int8_t { kRead = 0, kWrite = 1 } code;
enum Code : std::int8_t { kRead = 0, kWrite = 1, kBlock = 2 } code;
std::int32_t rank{-1};
std::int8_t* ptr{nullptr};
std::size_t n{0};
TCPSocket* sock{nullptr};
std::size_t off{0};
explicit Op(Code c) : code{c} { CHECK(c == kBlock); }
Op(Code c, std::int32_t rank, std::int8_t* ptr, std::size_t n, TCPSocket* sock, std::size_t off)
: code{c}, rank{rank}, ptr{ptr}, n{n}, sock{sock}, off{off} {}
Op(Op const&) = default;
@@ -41,12 +42,15 @@ class Loop {
std::mutex mu_;
std::queue<Op> queue_;
std::chrono::seconds timeout_;
Result rc_;
std::mutex rc_lock_; // lock for transferring error info.
bool stop_{false};
std::exception_ptr curr_exce_{nullptr};
common::Monitor timer_;
common::Monitor mutable timer_;
Result EmptyQueue();
Result EmptyQueue(std::queue<Op>* p_queue) const;
void Process();
public:
@@ -60,15 +64,7 @@ class Loop {
cv_.notify_one();
}
[[nodiscard]] Result Block() {
{
std::unique_lock lock{mu_};
cv_.notify_all();
}
std::unique_lock lock{mu_};
cv_.wait(lock, [this] { return this->queue_.empty() || stop_; });
return std::move(rc_);
}
[[nodiscard]] Result Block();
explicit Loop(std::chrono::seconds timeout);

View File

@@ -2,12 +2,14 @@
* Copyright 2023 XGBoost contributors
*/
#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
#include "comm.cuh"
#include "nccl_device_communicator.cuh"
namespace xgboost {
namespace collective {
NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sync)
NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sync,
StringView nccl_path)
: device_ordinal_{device_ordinal},
needs_sync_{needs_sync},
world_size_{GetWorldSize()},
@@ -18,6 +20,7 @@ NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sy
if (world_size_ == 1) {
return;
}
stub_ = std::make_shared<NcclStub>(std::move(nccl_path));
std::vector<uint64_t> uuids(world_size_ * kUuidLength, 0);
auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
@@ -43,7 +46,8 @@ NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sy
nccl_unique_id_ = GetUniqueId();
dh::safe_cuda(cudaSetDevice(device_ordinal_));
dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_));
auto rc = stub_->CommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_);
CHECK(rc.OK()) << rc.Report();
}
NcclDeviceCommunicator::~NcclDeviceCommunicator() {
@@ -51,7 +55,8 @@ NcclDeviceCommunicator::~NcclDeviceCommunicator() {
return;
}
if (nccl_comm_) {
dh::safe_nccl(ncclCommDestroy(nccl_comm_));
auto rc = stub_->CommDestroy(nccl_comm_);
CHECK(rc.OK()) << rc.Report();
}
if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
LOG(CONSOLE) << "======== NCCL Statistics========";
@@ -137,8 +142,9 @@ void NcclDeviceCommunicator::BitwiseAllReduce(void *send_receive_buffer, std::si
auto *device_buffer = buffer.data().get();
// First gather data from all the workers.
dh::safe_nccl(ncclAllGather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type),
nccl_comm_, dh::DefaultStream()));
auto rc = stub_->Allgather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type),
nccl_comm_, dh::DefaultStream());
CHECK(rc.OK()) << rc.Report();
if (needs_sync_) {
dh::DefaultStream().Sync();
}
@@ -170,9 +176,10 @@ void NcclDeviceCommunicator::AllReduce(void *send_receive_buffer, std::size_t co
if (IsBitwiseOp(op)) {
BitwiseAllReduce(send_receive_buffer, count, data_type, op);
} else {
dh::safe_nccl(ncclAllReduce(send_receive_buffer, send_receive_buffer, count,
GetNcclDataType(data_type), GetNcclRedOp(op), nccl_comm_,
dh::DefaultStream()));
auto rc = stub_->Allreduce(send_receive_buffer, send_receive_buffer, count,
GetNcclDataType(data_type), GetNcclRedOp(op), nccl_comm_,
dh::DefaultStream());
CHECK(rc.OK()) << rc.Report();
}
allreduce_bytes_ += count * GetTypeSize(data_type);
allreduce_calls_ += 1;
@@ -185,8 +192,9 @@ void NcclDeviceCommunicator::AllGather(void const *send_buffer, void *receive_bu
}
dh::safe_cuda(cudaSetDevice(device_ordinal_));
dh::safe_nccl(ncclAllGather(send_buffer, receive_buffer, send_size, ncclInt8, nccl_comm_,
dh::DefaultStream()));
auto rc = stub_->Allgather(send_buffer, receive_buffer, send_size, ncclInt8, nccl_comm_,
dh::DefaultStream());
CHECK(rc.OK()) << rc.Report();
}
void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_bytes,
@@ -206,14 +214,18 @@ void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_b
receive_buffer->resize(total_bytes);
size_t offset = 0;
dh::safe_nccl(ncclGroupStart());
for (int32_t i = 0; i < world_size_; ++i) {
size_t as_bytes = segments->at(i);
dh::safe_nccl(ncclBroadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes,
ncclChar, i, nccl_comm_, dh::DefaultStream()));
offset += as_bytes;
}
dh::safe_nccl(ncclGroupEnd());
auto rc = Success() << [&] { return stub_->GroupStart(); } << [&] {
for (int32_t i = 0; i < world_size_; ++i) {
size_t as_bytes = segments->at(i);
auto rc = stub_->Broadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes,
ncclChar, i, nccl_comm_, dh::DefaultStream());
if (!rc.OK()) {
return rc;
}
offset += as_bytes;
}
return Success();
} << [&] { return stub_->GroupEnd(); };
}
void NcclDeviceCommunicator::Synchronize() {

View File

@@ -4,8 +4,10 @@
#pragma once
#include "../common/device_helpers.cuh"
#include "comm.cuh"
#include "communicator.h"
#include "device_communicator.cuh"
#include "nccl_stub.h"
namespace xgboost {
namespace collective {
@@ -25,7 +27,7 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
* needed. The in-memory communicator is used in tests with multiple threads, each thread
* representing a rank/worker, so the additional synchronization is needed to avoid deadlocks.
*/
explicit NcclDeviceCommunicator(int device_ordinal, bool needs_sync);
explicit NcclDeviceCommunicator(int device_ordinal, bool needs_sync, StringView nccl_path);
~NcclDeviceCommunicator() override;
void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
Operation op) override;
@@ -74,7 +76,8 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
static const int kRootRank = 0;
ncclUniqueId id;
if (rank_ == kRootRank) {
dh::safe_nccl(ncclGetUniqueId(&id));
auto rc = stub_->GetUniqueId(&id);
CHECK(rc.OK()) << rc.Report();
}
Broadcast(static_cast<void *>(&id), sizeof(ncclUniqueId), static_cast<int>(kRootRank));
return id;
@@ -88,6 +91,7 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
int const world_size_;
int const rank_;
ncclComm_t nccl_comm_{};
std::shared_ptr<NcclStub> stub_;
ncclUniqueId nccl_unique_id_{};
size_t allreduce_bytes_{0}; // Keep statistics of the number of bytes communicated.
size_t allreduce_calls_{0}; // Keep statistics of the number of reduce calls.

131
src/collective/nccl_stub.cc Normal file
View File

@@ -0,0 +1,131 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#if defined(XGBOOST_USE_NCCL) || (defined(XGBOOST_USE_RCCL) && 0)
#include "nccl_stub.h"
#include <cuda.h> // for CUDA_VERSION
#include <cuda_runtime_api.h> // for cudaPeekAtLastError
#include <dlfcn.h> // for dlclose, dlsym, dlopen
#include <nccl.h>
#include <thrust/system/cuda/error.h> // for cuda_category
#include <thrust/system_error.h> // for system_error
#include <cstdint> // for int32_t
#include <sstream> // for stringstream
#include <string> // for string
#include <utility> // for move
#include "xgboost/logging.h"
namespace xgboost::collective {
Result NcclStub::GetNcclResult(ncclResult_t code) const {
if (code == ncclSuccess) {
return Success();
}
std::stringstream ss;
ss << "NCCL failure: " << this->GetErrorString(code) << ".";
if (code == ncclUnhandledCudaError) {
// nccl usually preserves the last error so we can get more details.
auto err = cudaPeekAtLastError();
ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
} else if (code == ncclSystemError) {
ss << " This might be caused by a network configuration issue. Please consider specifying "
"the network interface for NCCL via environment variables listed in its reference: "
"`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
}
return Fail(ss.str());
}
NcclStub::NcclStub(StringView path) : path_{std::move(path)} {
#if defined(XGBOOST_USE_DLOPEN_NCCL) || defined(XGBOOST_USE_DLOPEN_RCCL)
CHECK(!path_.empty()) << "Empty path for NCCL.";
auto cu_major = (CUDA_VERSION) / 1000;
std::stringstream ss;
ss << R"m(
If XGBoost is installed from PyPI with pip, the error can fixed by:
- Run `pip install nvidia-nccl-cu)m"
<< cu_major << "` (Or with any CUDA version that's compatible with " << cu_major << ").";
ss << R"m(
Otherwise, please refer to:
https://xgboost.readthedocs.io/en/stable/tutorials/dask.html#troubleshooting
for more info, or open an issue on GitHub. Starting from XGBoost 2.1.0, the PyPI package
no long bundles NCCL in the binary wheel.
)m";
auto help = ss.str();
std::string msg{"Failed to load NCCL from path: `" + path_ + "`. Error:\n "};
auto safe_load = [&](auto t, StringView name) {
std::stringstream errs;
auto ptr = reinterpret_cast<decltype(t)>(dlsym(handle_, name.c_str()));
if (!ptr) {
errs << "Failed to load NCCL symbol `" << name << "` from " << path_ << ". Error:\n "
<< dlerror() << help;
LOG(FATAL) << errs.str();
}
return ptr;
};
handle_ = dlopen(path_.c_str(), RTLD_LAZY);
if (!handle_) {
LOG(FATAL) << msg << dlerror() << help;
}
allreduce_ = safe_load(allreduce_, "ncclAllReduce");
broadcast_ = safe_load(broadcast_, "ncclBroadcast");
allgather_ = safe_load(allgather_, "ncclAllGather");
comm_init_rank_ = safe_load(comm_init_rank_, "ncclCommInitRank");
comm_destroy_ = safe_load(comm_destroy_, "ncclCommDestroy");
get_uniqueid_ = safe_load(get_uniqueid_, "ncclGetUniqueId");
send_ = safe_load(send_, "ncclSend");
recv_ = safe_load(recv_, "ncclRecv");
group_start_ = safe_load(group_start_, "ncclGroupStart");
group_end_ = safe_load(group_end_, "ncclGroupEnd");
get_error_string_ = safe_load(get_error_string_, "ncclGetErrorString");
get_version_ = safe_load(get_version_, "ncclGetVersion");
std::int32_t v;
CHECK_EQ(get_version_(&v), ncclSuccess);
auto patch = v % 100;
auto minor = (v / 100) % 100;
auto major = v / 10000;
LOG(INFO) << "Loaded shared NCCL " << major << "." << minor << "." << patch << ":`" << path_
<< "`" << std::endl;
#else
allreduce_ = ncclAllReduce;
broadcast_ = ncclBroadcast;
allgather_ = ncclAllGather;
comm_init_rank_ = ncclCommInitRank;
comm_destroy_ = ncclCommDestroy;
get_uniqueid_ = ncclGetUniqueId;
send_ = ncclSend;
recv_ = ncclRecv;
group_start_ = ncclGroupStart;
group_end_ = ncclGroupEnd;
get_error_string_ = ncclGetErrorString;
get_version_ = ncclGetVersion;
#endif
};
NcclStub::~NcclStub() { // NOLINT
#if defined(XGBOOST_USE_DLOPEN_NCCL) || defined(XGBOOST_USE_DLOPEN_RCCL)
if (handle_) {
auto rc = dlclose(handle_);
if (rc != 0) {
LOG(WARNING) << "Failed to close NCCL handle:" << dlerror();
}
}
handle_ = nullptr;
#endif // defined(XGBOOST_USE_DLOPEN_NCCL)
}
} // namespace xgboost::collective
#endif // defined(XGBOOST_USE_NCCL)

View File

@@ -0,0 +1,86 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#pragma once
#if defined(XGBOOST_USE_NCCL) || (defined(XGBOOST_USE_RCCL) && 0)
#include <cuda_runtime_api.h>
#include <nccl.h>
#include <string> // for string
#include "xgboost/collective/result.h" // for Result
#include "xgboost/string_view.h" // for StringView
namespace xgboost::collective {
/**
* @brief A stub for NCCL to facilitate dynamic loading.
*/
class NcclStub {
#if defined(XGBOOST_USE_DLOPEN_NCCL) || defined(XGBOOST_USE_DLOPEN_RCCL)
void* handle_{nullptr};
#endif // defined(XGBOOST_USE_DLOPEN_NCCL)
std::string path_;
decltype(ncclAllReduce)* allreduce_{nullptr};
decltype(ncclBroadcast)* broadcast_{nullptr};
decltype(ncclAllGather)* allgather_{nullptr};
decltype(ncclCommInitRank)* comm_init_rank_{nullptr};
decltype(ncclCommDestroy)* comm_destroy_{nullptr};
decltype(ncclGetUniqueId)* get_uniqueid_{nullptr};
decltype(ncclSend)* send_{nullptr};
decltype(ncclRecv)* recv_{nullptr};
decltype(ncclGroupStart)* group_start_{nullptr};
decltype(ncclGroupEnd)* group_end_{nullptr};
decltype(ncclGetErrorString)* get_error_string_{nullptr};
decltype(ncclGetVersion)* get_version_{nullptr};
public:
Result GetNcclResult(ncclResult_t code) const;
public:
explicit NcclStub(StringView path);
~NcclStub();
[[nodiscard]] Result Allreduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
cudaStream_t stream) const {
return this->GetNcclResult(allreduce_(sendbuff, recvbuff, count, datatype, op, comm, stream));
}
[[nodiscard]] Result Broadcast(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, int root, ncclComm_t comm,
cudaStream_t stream) const {
return this->GetNcclResult(broadcast_(sendbuff, recvbuff, count, datatype, root, comm, stream));
}
[[nodiscard]] Result Allgather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm,
cudaStream_t stream) const {
return this->GetNcclResult(allgather_(sendbuff, recvbuff, sendcount, datatype, comm, stream));
}
[[nodiscard]] Result CommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId,
int rank) const {
return this->GetNcclResult(this->comm_init_rank_(comm, nranks, commId, rank));
}
[[nodiscard]] Result CommDestroy(ncclComm_t comm) const {
return this->GetNcclResult(comm_destroy_(comm));
}
[[nodiscard]] Result GetUniqueId(ncclUniqueId* uniqueId) const {
return this->GetNcclResult(get_uniqueid_(uniqueId));
}
[[nodiscard]] Result Send(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
return this->GetNcclResult(send_(sendbuff, count, datatype, peer, comm, stream));
}
[[nodiscard]] Result Recv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) const {
return this->GetNcclResult(recv_(recvbuff, count, datatype, peer, comm, stream));
}
[[nodiscard]] Result GroupStart() const { return this->GetNcclResult(group_start_()); }
[[nodiscard]] Result GroupEnd() const { return this->GetNcclResult(group_end_()); }
[[nodiscard]] const char* GetErrorString(ncclResult_t result) const {
return get_error_string_(result);
}
};
} // namespace xgboost::collective
#endif // defined(XGBOOST_USE_NCCL)

View File

@@ -58,36 +58,35 @@ Result Tracker::WaitUntilReady() const {
RabitTracker::WorkerProxy::WorkerProxy(std::int32_t world, TCPSocket sock, SockAddrV4 addr)
: sock_{std::move(sock)} {
auto host = addr.Addr();
std::int32_t rank{0};
rc_ = Success()
<< [&] { return proto::Magic{}.Verify(&sock_); }
<< [&] { return proto::Connect{}.TrackerRecv(&sock_, &world_, &rank, &task_id_); };
if (!rc_.OK()) {
return;
}
std::string cmd;
sock_.Recv(&cmd);
auto jcmd = Json::Load(StringView{cmd});
cmd_ = static_cast<proto::CMD>(get<Integer const>(jcmd["cmd"]));
Json jcmd;
std::int32_t port{0};
if (cmd_ == proto::CMD::kStart) {
proto::Start start;
rc_ = start.TrackerHandle(jcmd, &world_, world, &port, &sock_, &eport_);
} else if (cmd_ == proto::CMD::kPrint) {
proto::Print print;
rc_ = print.TrackerHandle(jcmd, &msg_);
} else if (cmd_ == proto::CMD::kError) {
proto::ErrorCMD error;
rc_ = error.TrackerHandle(jcmd, &msg_, &code_);
}
if (!rc_.OK()) {
return;
}
info_ = proto::PeerInfo{host, port, rank};
rc_ = Success() << [&] { return proto::Magic{}.Verify(&sock_); } << [&] {
return proto::Connect{}.TrackerRecv(&sock_, &world_, &rank, &task_id_);
} << [&] {
std::string cmd;
sock_.Recv(&cmd);
jcmd = Json::Load(StringView{cmd});
cmd_ = static_cast<proto::CMD>(get<Integer const>(jcmd["cmd"]));
return Success();
} << [&] {
if (cmd_ == proto::CMD::kStart) {
proto::Start start;
return start.TrackerHandle(jcmd, &world_, world, &port, &sock_, &eport_);
} else if (cmd_ == proto::CMD::kPrint) {
proto::Print print;
return print.TrackerHandle(jcmd, &msg_);
} else if (cmd_ == proto::CMD::kError) {
proto::ErrorCMD error;
return error.TrackerHandle(jcmd, &msg_, &code_);
}
return Success();
} << [&] {
auto host = addr.Addr();
info_ = proto::PeerInfo{host, port, rank};
return Success();
};
}
RabitTracker::RabitTracker(Json const& config) : Tracker{config} {
@@ -137,15 +136,18 @@ Result RabitTracker::Bootstrap(std::vector<WorkerProxy>* p_workers) {
std::int32_t n_shutdown{0};
bool during_restart{false};
bool running{false};
std::vector<WorkerProxy> pending;
explicit State(std::int32_t world) : n_workers{world} {}
State(State const& that) = delete;
State& operator=(State&& that) = delete;
// modifiers
void Start(WorkerProxy&& worker) {
CHECK_LT(pending.size(), n_workers);
CHECK_LE(n_shutdown, n_workers);
CHECK(!running);
pending.emplace_back(std::forward<WorkerProxy>(worker));
@@ -155,6 +157,7 @@ Result RabitTracker::Bootstrap(std::vector<WorkerProxy>* p_workers) {
CHECK_GE(n_shutdown, 0);
CHECK_LT(n_shutdown, n_workers);
running = false;
++n_shutdown;
CHECK_LE(n_shutdown, n_workers);
@@ -163,21 +166,26 @@ Result RabitTracker::Bootstrap(std::vector<WorkerProxy>* p_workers) {
CHECK_LE(pending.size(), n_workers);
CHECK_LE(n_shutdown, n_workers);
running = false;
during_restart = true;
}
[[nodiscard]] bool Ready() const {
CHECK_LE(pending.size(), n_workers);
return static_cast<std::int32_t>(pending.size()) == n_workers;
}
void Bootstrap() {
CHECK_EQ(pending.size(), n_workers);
CHECK_LE(n_shutdown, n_workers);
running = true;
// A reset.
n_shutdown = 0;
during_restart = false;
pending.clear();
}
// observers
[[nodiscard]] bool Ready() const {
CHECK_LE(pending.size(), n_workers);
return static_cast<std::int32_t>(pending.size()) == n_workers;
}
[[nodiscard]] bool ShouldContinue() const {
CHECK_LE(pending.size(), n_workers);
CHECK_LE(n_shutdown, n_workers);
@@ -187,7 +195,31 @@ Result RabitTracker::Bootstrap(std::vector<WorkerProxy>* p_workers) {
}
};
return std::async(std::launch::async, [this] {
auto handle_error = [&](WorkerProxy const& worker) {
auto msg = worker.Msg();
auto code = worker.Code();
LOG(WARNING) << "Recieved error from [" << worker.Host() << ":" << worker.Rank() << "]: " << msg
<< " code:" << code;
auto host = worker.Host();
// We signal all workers for the error, if they haven't aborted already.
for (auto& w : worker_error_handles_) {
if (w.first == host) {
continue;
}
TCPSocket out;
// Connecting to the error port as a signal for exit.
//
// retry is set to 1, just let the worker timeout or error. Otherwise the
// tracker and the worker might be waiting for each other.
auto rc = Connect(w.first, w.second, 1, timeout_, &out);
if (!rc.OK()) {
return Fail("Failed to inform workers to stop.");
}
}
return Success();
};
return std::async(std::launch::async, [this, handle_error] {
State state{this->n_workers_};
while (state.ShouldContinue()) {
@@ -205,6 +237,16 @@ Result RabitTracker::Bootstrap(std::vector<WorkerProxy>* p_workers) {
}
switch (worker.Command()) {
case proto::CMD::kStart: {
if (state.running) {
// Something went wrong with one of the workers. It got disconnected without
// notice.
state.Error();
rc = handle_error(worker);
if (!rc.OK()) {
return Fail("Failed to handle abort.", std::move(rc));
}
}
state.Start(std::move(worker));
if (state.Ready()) {
rc = this->Bootstrap(&state.pending);
@@ -216,36 +258,20 @@ Result RabitTracker::Bootstrap(std::vector<WorkerProxy>* p_workers) {
continue;
}
case proto::CMD::kShutdown: {
if (state.during_restart) {
// The worker can still send shutdown after call to `std::exit`.
continue;
}
state.Shutdown();
continue;
}
case proto::CMD::kError: {
if (state.during_restart) {
// Ignore further errors.
continue;
}
state.Error();
auto msg = worker.Msg();
auto code = worker.Code();
LOG(WARNING) << "Recieved error from [" << worker.Host() << ":" << worker.Rank()
<< "]: " << msg << " code:" << code;
auto host = worker.Host();
// We signal all workers for the error, if they haven't aborted already.
for (auto& w : worker_error_handles_) {
if (w.first == host) {
continue;
}
TCPSocket out;
// retry is set to 1, just let the worker timeout or error. Otherwise the
// tracker and the worker might be waiting for each other.
auto rc = Connect(w.first, w.second, 1, timeout_, &out);
// send signal to stop the worker.
proto::ShutdownCMD shutdown;
rc = shutdown.Send(&out);
if (!rc.OK()) {
return Fail("Failed to inform workers to stop.");
}
}
rc = handle_error(worker);
continue;
}
case proto::CMD::kPrint: {

View File

@@ -114,6 +114,9 @@ class RabitTracker : public Tracker {
// record for how to reach out to workers if error happens.
std::vector<std::pair<std::string, std::int32_t>> worker_error_handles_;
// listening socket for incoming workers.
//
// At the moment, the listener calls accept without first polling. We can add an
// additional unix domain socket to allow cancelling the accept.
TCPSocket listener_;
Result Bootstrap(std::vector<WorkerProxy>* p_workers);

View File

@@ -29,8 +29,7 @@
#include "xgboost/logging.h" // CHECK
#include "xgboost/span.h" // Span,byte
namespace xgboost {
namespace common {
namespace xgboost::common {
namespace detail {
// Wrapper around cub sort to define is_decending
@@ -165,13 +164,14 @@ inline void SegmentedSortKeys(Context const *ctx, Span<V const> group_ptr,
template <bool accending, bool per_seg_index, typename U, typename V, typename IdxT>
void SegmentedArgSort(Context const *ctx, Span<U> values, Span<V> group_ptr,
Span<IdxT> sorted_idx) {
auto cuctx = ctx->CUDACtx();
CHECK_GE(group_ptr.size(), 1ul);
std::size_t n_groups = group_ptr.size() - 1;
std::size_t bytes = 0;
if (per_seg_index) {
SegmentedSequence(ctx, group_ptr, sorted_idx);
} else {
dh::Iota(sorted_idx);
dh::Iota(sorted_idx, cuctx->Stream());
}
dh::TemporaryArray<std::remove_const_t<U>> values_out(values.size());
dh::TemporaryArray<std::remove_const_t<IdxT>> sorted_idx_out(sorted_idx.size());
@@ -179,15 +179,16 @@ void SegmentedArgSort(Context const *ctx, Span<U> values, Span<V> group_ptr,
detail::DeviceSegmentedRadixSortPair<!accending>(
nullptr, bytes, values.data(), values_out.data().get(), sorted_idx.data(),
sorted_idx_out.data().get(), sorted_idx.size(), n_groups, group_ptr.data(),
group_ptr.data() + 1, ctx->CUDACtx()->Stream());
group_ptr.data() + 1, cuctx->Stream());
dh::TemporaryArray<byte> temp_storage(bytes);
detail::DeviceSegmentedRadixSortPair<!accending>(
temp_storage.data().get(), bytes, values.data(), values_out.data().get(), sorted_idx.data(),
sorted_idx_out.data().get(), sorted_idx.size(), n_groups, group_ptr.data(),
group_ptr.data() + 1, ctx->CUDACtx()->Stream());
group_ptr.data() + 1, cuctx->Stream());
dh::safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice));
sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice,
cuctx->Stream()));
}
/**
@@ -197,11 +198,12 @@ void SegmentedArgSort(Context const *ctx, Span<U> values, Span<V> group_ptr,
template <typename SegIt, typename ValIt>
void SegmentedArgMergeSort(Context const *ctx, SegIt seg_begin, SegIt seg_end, ValIt val_begin,
ValIt val_end, dh::device_vector<std::size_t> *p_sorted_idx) {
auto cuctx = ctx->CUDACtx();
using Tup = thrust::tuple<std::int32_t, float>;
auto &sorted_idx = *p_sorted_idx;
std::size_t n = std::distance(val_begin, val_end);
sorted_idx.resize(n);
dh::Iota(dh::ToSpan(sorted_idx));
dh::Iota(dh::ToSpan(sorted_idx), cuctx->Stream());
dh::device_vector<Tup> keys(sorted_idx.size());
auto key_it = dh::MakeTransformIterator<Tup>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) -> Tup {
@@ -215,7 +217,7 @@ void SegmentedArgMergeSort(Context const *ctx, SegIt seg_begin, SegIt seg_end, V
return thrust::make_tuple(seg_idx, residue);
});
thrust::copy(ctx->CUDACtx()->CTP(), key_it, key_it + keys.size(), keys.begin());
thrust::stable_sort_by_key(ctx->CUDACtx()->TP(), keys.begin(), keys.end(), sorted_idx.begin(),
thrust::stable_sort_by_key(cuctx->TP(), keys.begin(), keys.end(), sorted_idx.begin(),
[=] XGBOOST_DEVICE(Tup const &l, Tup const &r) {
if (thrust::get<0>(l) != thrust::get<0>(r)) {
return thrust::get<0>(l) < thrust::get<0>(r); // segment index
@@ -223,6 +225,75 @@ void SegmentedArgMergeSort(Context const *ctx, SegIt seg_begin, SegIt seg_end, V
return thrust::get<1>(l) < thrust::get<1>(r); // residue
});
}
} // namespace common
} // namespace xgboost
template <bool accending, typename IdxT, typename U>
void ArgSort(xgboost::Context const *ctx, xgboost::common::Span<U> keys,
xgboost::common::Span<IdxT> sorted_idx) {
std::size_t bytes = 0;
auto cuctx = ctx->CUDACtx();
dh::Iota(sorted_idx, cuctx->Stream());
using KeyT = typename decltype(keys)::value_type;
using ValueT = std::remove_const_t<IdxT>;
dh::TemporaryArray<KeyT> out(keys.size());
cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(keys.data()), out.data().get());
dh::TemporaryArray<IdxT> sorted_idx_out(sorted_idx.size());
cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(sorted_idx.data()),
sorted_idx_out.data().get());
// track https://github.com/NVIDIA/cub/pull/340 for 64bit length support
using OffsetT = std::conditional_t<!dh::BuildWithCUDACub(), std::ptrdiff_t, int32_t>;
CHECK_LE(sorted_idx.size(), std::numeric_limits<OffsetT>::max());
if (accending) {
void *d_temp_storage = nullptr;
#if THRUST_MAJOR_VERSION >= 2
dh::safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
cuctx->Stream())));
#else
dh::safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
nullptr, false)));
#endif
dh::TemporaryArray<char> storage(bytes);
d_temp_storage = storage.data().get();
#if THRUST_MAJOR_VERSION >= 2
dh::safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
cuctx->Stream())));
#else
dh::safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
nullptr, false)));
#endif
} else {
void *d_temp_storage = nullptr;
#if THRUST_MAJOR_VERSION >= 2
dh::safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
cuctx->Stream())));
#else
dh::safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
nullptr, false)));
#endif
dh::TemporaryArray<char> storage(bytes);
d_temp_storage = storage.data().get();
#if THRUST_MAJOR_VERSION >= 2
dh::safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
cuctx->Stream())));
#else
dh::safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
nullptr, false)));
#endif
}
dh::safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice,
cuctx->Stream()));
}
} // namespace xgboost::common
#endif // XGBOOST_COMMON_ALGORITHM_CUH_

View File

@@ -176,10 +176,10 @@ inline void AssertNCCLSupport() {
#endif // !defined(XGBOOST_USE_NCCL)
}
inline void AssertOneAPISupport() {
#ifndef XGBOOST_USE_ONEAPI
LOG(FATAL) << "XGBoost version not compiled with OneAPI support.";
#endif // XGBOOST_USE_ONEAPI
inline void AssertSYCLSupport() {
#ifndef XGBOOST_USE_SYCL
LOG(FATAL) << "XGBoost version not compiled with SYCL support.";
#endif // XGBOOST_USE_SYCL
}
void SetDevice(std::int32_t device);

View File

@@ -38,10 +38,6 @@
#include "xgboost/logging.h"
#include "xgboost/span.h"
#ifdef XGBOOST_USE_NCCL
#include "nccl.h"
#endif // XGBOOST_USE_NCCL
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
#include "rmm/mr/device/per_device_resource.hpp"
#include "rmm/mr/device/thrust_allocator_adaptor.hpp"
@@ -117,30 +113,6 @@ XGBOOST_DEV_INLINE T atomicAdd(T *addr, T v) { // NOLINT
}
namespace dh {
#ifdef XGBOOST_USE_NCCL
#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)
inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) {
if (code != ncclSuccess) {
std::stringstream ss;
ss << "NCCL failure: " << ncclGetErrorString(code) << ".";
ss << " " << file << "(" << line << ")\n";
if (code == ncclUnhandledCudaError) {
// nccl usually preserves the last error so we can get more details.
auto err = cudaPeekAtLastError();
ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
} else if (code == ncclSystemError) {
ss << " This might be caused by a network configuration issue. Please consider specifying "
"the network interface for NCCL via environment variables listed in its reference: "
"`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
}
LOG(FATAL) << ss.str();
}
return code;
}
#endif
inline int32_t CudaGetPointerDevice(void const *ptr) {
int32_t device = -1;
cudaPointerAttributes attr;
@@ -315,8 +287,8 @@ inline void LaunchN(size_t n, L lambda) {
}
template <typename Container>
void Iota(Container array) {
LaunchN(array.size(), [=] __device__(size_t i) { array[i] = i; });
void Iota(Container array, cudaStream_t stream) {
LaunchN(array.size(), stream, [=] __device__(size_t i) { array[i] = i; });
}
namespace detail {
@@ -482,7 +454,8 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
cub::CachingDeviceAllocator& GetGlobalCachingAllocator() {
// Configure allocator with maximum cached bin size of ~1GB and no limit on
// maximum cached bytes
thread_local cub::CachingDeviceAllocator *allocator = new cub::CachingDeviceAllocator(2, 9, 29);
thread_local std::unique_ptr<cub::CachingDeviceAllocator> allocator{
std::make_unique<cub::CachingDeviceAllocator>(2, 9, 29)};
return *allocator;
}
pointer allocate(size_t n) { // NOLINT
@@ -598,6 +571,16 @@ class DoubleBuffer {
T *Other() { return buff.Alternate(); }
};
template <typename T>
xgboost::common::Span<T> LazyResize(xgboost::Context const *ctx,
xgboost::HostDeviceVector<T> *buffer, std::size_t n) {
buffer->SetDevice(ctx->Device());
if (buffer->Size() < n) {
buffer->Resize(n);
}
return buffer->DeviceSpan().subspan(0, n);
}
/**
* \brief Copies device span to std::vector.
*
@@ -1061,74 +1044,6 @@ void InclusiveSum(InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items)
InclusiveScan(d_in, d_out, cub::Sum(), num_items);
}
template <bool accending, typename IdxT, typename U>
void ArgSort(xgboost::common::Span<U> keys, xgboost::common::Span<IdxT> sorted_idx) {
size_t bytes = 0;
Iota(sorted_idx);
using KeyT = typename decltype(keys)::value_type;
using ValueT = std::remove_const_t<IdxT>;
TemporaryArray<KeyT> out(keys.size());
cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(keys.data()),
out.data().get());
TemporaryArray<IdxT> sorted_idx_out(sorted_idx.size());
cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(sorted_idx.data()),
sorted_idx_out.data().get());
// track https://github.com/NVIDIA/cub/pull/340 for 64bit length support
using OffsetT = std::conditional_t<!BuildWithCUDACub(), std::ptrdiff_t, int32_t>;
CHECK_LE(sorted_idx.size(), std::numeric_limits<OffsetT>::max());
if (accending) {
void *d_temp_storage = nullptr;
#if THRUST_MAJOR_VERSION >= 2
safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
sizeof(KeyT) * 8, false, nullptr)));
#else
safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
sizeof(KeyT) * 8, false, nullptr, false)));
#endif
TemporaryArray<char> storage(bytes);
d_temp_storage = storage.data().get();
#if THRUST_MAJOR_VERSION >= 2
safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
sizeof(KeyT) * 8, false, nullptr)));
#else
safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
sizeof(KeyT) * 8, false, nullptr, false)));
#endif
} else {
void *d_temp_storage = nullptr;
#if THRUST_MAJOR_VERSION >= 2
safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
sizeof(KeyT) * 8, false, nullptr)));
#else
safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
sizeof(KeyT) * 8, false, nullptr, false)));
#endif
TemporaryArray<char> storage(bytes);
d_temp_storage = storage.data().get();
#if THRUST_MAJOR_VERSION >= 2
safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
sizeof(KeyT) * 8, false, nullptr)));
#else
safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
sizeof(KeyT) * 8, false, nullptr, false)));
#endif
}
safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice));
}
class CUDAStreamView;
class CUDAEvent {

View File

@@ -97,5 +97,7 @@ constexpr StringView InvalidCUDAOrdinal() {
}
void MismatchedDevices(Context const* booster, Context const* data);
inline auto NoFederated() { return "XGBoost is not compiled with federated learning support."; }
} // namespace xgboost::error
#endif // XGBOOST_COMMON_ERROR_MSG_H_

View File

@@ -51,7 +51,7 @@ HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins
for (auto const &page : m->GetBatches<SparsePage>()) {
container.PushRowPage(page, info, hessian);
}
container.MakeCuts(m->Info(), &out);
container.MakeCuts(ctx, m->Info(), &out);
} else {
SortedSketchContainer container{ctx,
max_bins,
@@ -61,7 +61,7 @@ HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins
for (auto const &page : m->GetBatches<SortedCSCPage>(ctx)) {
container.PushColPage(page, info, hessian);
}
container.MakeCuts(m->Info(), &out);
container.MakeCuts(ctx, m->Info(), &out);
}
return out;

View File

@@ -359,7 +359,7 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b
}
}
sketch_container.MakeCuts(&cuts, p_fmat->Info().IsColumnSplit());
sketch_container.MakeCuts(ctx, &cuts, p_fmat->Info().IsColumnSplit());
return cuts;
}
} // namespace xgboost::common

View File

@@ -11,9 +11,7 @@
#include "categorical.h"
#include "hist_util.h"
namespace xgboost {
namespace common {
namespace xgboost::common {
template <typename WQSketch>
SketchContainerImpl<WQSketch>::SketchContainerImpl(Context const *ctx,
std::vector<bst_row_t> columns_size,
@@ -129,7 +127,7 @@ struct QuantileAllreduce {
* \param rank rank of target worker
* \param fidx feature idx
*/
auto Values(int32_t rank, bst_feature_t fidx) const {
[[nodiscard]] auto Values(int32_t rank, bst_feature_t fidx) const {
// get span for worker
auto wsize = worker_indptr[rank + 1] - worker_indptr[rank];
auto worker_values = global_values.subspan(worker_indptr[rank], wsize);
@@ -145,7 +143,7 @@ struct QuantileAllreduce {
template <typename WQSketch>
void SketchContainerImpl<WQSketch>::GatherSketchInfo(
MetaInfo const& info,
Context const *, MetaInfo const &info,
std::vector<typename WQSketch::SummaryContainer> const &reduced,
std::vector<size_t> *p_worker_segments, std::vector<bst_row_t> *p_sketches_scan,
std::vector<typename WQSketch::Entry> *p_global_sketches) {
@@ -206,7 +204,7 @@ void SketchContainerImpl<WQSketch>::GatherSketchInfo(
}
template <typename WQSketch>
void SketchContainerImpl<WQSketch>::AllreduceCategories(MetaInfo const& info) {
void SketchContainerImpl<WQSketch>::AllreduceCategories(Context const*, MetaInfo const& info) {
auto world_size = collective::GetWorldSize();
auto rank = collective::GetRank();
if (world_size == 1 || info.IsColumnSplit()) {
@@ -274,16 +272,15 @@ void SketchContainerImpl<WQSketch>::AllreduceCategories(MetaInfo const& info) {
template <typename WQSketch>
void SketchContainerImpl<WQSketch>::AllReduce(
MetaInfo const& info,
std::vector<typename WQSketch::SummaryContainer> *p_reduced,
std::vector<int32_t>* p_num_cuts) {
Context const *ctx, MetaInfo const &info,
std::vector<typename WQSketch::SummaryContainer> *p_reduced, std::vector<int32_t> *p_num_cuts) {
monitor_.Start(__func__);
size_t n_columns = sketches_.size();
collective::Allreduce<collective::Operation::kMax>(&n_columns, 1);
CHECK_EQ(n_columns, sketches_.size()) << "Number of columns differs across workers";
AllreduceCategories(info);
AllreduceCategories(ctx, info);
auto& num_cuts = *p_num_cuts;
CHECK_EQ(num_cuts.size(), 0);
@@ -324,7 +321,7 @@ void SketchContainerImpl<WQSketch>::AllReduce(
std::vector<bst_row_t> sketches_scan((n_columns + 1) * world, 0);
std::vector<typename WQSketch::Entry> global_sketches;
this->GatherSketchInfo(info, reduced, &worker_segments, &sketches_scan, &global_sketches);
this->GatherSketchInfo(ctx, info, reduced, &worker_segments, &sketches_scan, &global_sketches);
std::vector<typename WQSketch::SummaryContainer> final_sketches(n_columns);
@@ -383,11 +380,12 @@ auto AddCategories(std::set<float> const &categories, HistogramCuts *cuts) {
}
template <typename WQSketch>
void SketchContainerImpl<WQSketch>::MakeCuts(MetaInfo const &info, HistogramCuts *p_cuts) {
void SketchContainerImpl<WQSketch>::MakeCuts(Context const *ctx, MetaInfo const &info,
HistogramCuts *p_cuts) {
monitor_.Start(__func__);
std::vector<typename WQSketch::SummaryContainer> reduced;
std::vector<int32_t> num_cuts;
this->AllReduce(info, &reduced, &num_cuts);
this->AllReduce(ctx, info, &reduced, &num_cuts);
p_cuts->min_vals_.HostVector().resize(sketches_.size(), 0.0f);
std::vector<typename WQSketch::SummaryContainer> final_summaries(reduced.size());
@@ -496,5 +494,4 @@ void SortedSketchContainer::PushColPage(SparsePage const &page, MetaInfo const &
});
monitor_.Stop(__func__);
}
} // namespace common
} // namespace xgboost
} // namespace xgboost::common

View File

@@ -22,9 +22,7 @@
#include "transform_iterator.h" // MakeIndexTransformIter
#include "xgboost/span.h"
namespace xgboost {
namespace common {
namespace xgboost::common {
using WQSketch = HostSketchContainer::WQSketch;
using SketchEntry = WQSketch::Entry;
@@ -504,7 +502,7 @@ void SketchContainer::FixError() {
});
}
void SketchContainer::AllReduce(bool is_column_split) {
void SketchContainer::AllReduce(Context const*, bool is_column_split) {
dh::safe_cuda(cudaSetDevice(device_.ordinal));
auto world = collective::GetWorldSize();
if (world == 1 || is_column_split) {
@@ -585,13 +583,13 @@ struct InvalidCatOp {
};
} // anonymous namespace
void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool is_column_split) {
timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_.ordinal));
p_cuts->min_vals_.Resize(num_columns_);
// Sync between workers.
this->AllReduce(is_column_split);
this->AllReduce(ctx, is_column_split);
// Prune to final number of bins.
this->Prune(num_bins_ + 1);
@@ -734,5 +732,4 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
p_cuts->SetCategorical(this->has_categorical_, max_cat);
timer_.Stop(__func__);
}
} // namespace common
} // namespace xgboost
} // namespace xgboost::common

View File

@@ -151,9 +151,9 @@ class SketchContainer {
Span<SketchEntry const> that);
/* \brief Merge quantiles from other GPU workers. */
void AllReduce(bool is_column_split);
void AllReduce(Context const* ctx, bool is_column_split);
/* \brief Create the final histogram cut values. */
void MakeCuts(HistogramCuts* cuts, bool is_column_split);
void MakeCuts(Context const* ctx, HistogramCuts* cuts, bool is_column_split);
Span<SketchEntry const> Data() const {
return {this->Current().data().get(), this->Current().size()};

View File

@@ -827,13 +827,14 @@ class SketchContainerImpl {
return group_ind;
}
// Gather sketches from all workers.
void GatherSketchInfo(MetaInfo const& info,
void GatherSketchInfo(Context const *ctx, MetaInfo const &info,
std::vector<typename WQSketch::SummaryContainer> const &reduced,
std::vector<bst_row_t> *p_worker_segments,
std::vector<bst_row_t> *p_sketches_scan,
std::vector<typename WQSketch::Entry> *p_global_sketches);
// Merge sketches from all workers.
void AllReduce(MetaInfo const& info, std::vector<typename WQSketch::SummaryContainer> *p_reduced,
void AllReduce(Context const *ctx, MetaInfo const &info,
std::vector<typename WQSketch::SummaryContainer> *p_reduced,
std::vector<int32_t> *p_num_cuts);
template <typename Batch, typename IsValid>
@@ -887,11 +888,11 @@ class SketchContainerImpl {
/* \brief Push a CSR matrix. */
void PushRowPage(SparsePage const &page, MetaInfo const &info, Span<float const> hessian = {});
void MakeCuts(MetaInfo const& info, HistogramCuts* cuts);
void MakeCuts(Context const *ctx, MetaInfo const &info, HistogramCuts *cuts);
private:
// Merge all categories from other workers.
void AllreduceCategories(MetaInfo const& info);
void AllreduceCategories(Context const* ctx, MetaInfo const& info);
};
class HostSketchContainer : public SketchContainerImpl<WQuantileSketch<float, float>> {

View File

@@ -1,32 +1,50 @@
/*!
* Copyright 2020 by XGBoost Contributors
* \file random.cc
/**
* Copyright 2020-2023, XGBoost Contributors
*/
#include "random.h"
namespace xgboost {
namespace common {
#include <algorithm> // for sort, max, copy
#include <memory> // for shared_ptr
#include "xgboost/host_device_vector.h" // for HostDeviceVector
namespace xgboost::common {
std::shared_ptr<HostDeviceVector<bst_feature_t>> ColumnSampler::ColSample(
std::shared_ptr<HostDeviceVector<bst_feature_t>> p_features, float colsample) {
if (colsample == 1.0f) {
return p_features;
}
int n = std::max(1, static_cast<int>(colsample * p_features->Size()));
auto p_new_features = std::make_shared<HostDeviceVector<bst_feature_t>>();
if (ctx_->IsCUDA()) {
#if defined(XGBOOST_USE_CUDA)
cuda_impl::SampleFeature(ctx_, n, p_features, p_new_features, this->feature_weights_,
&this->weight_buffer_, &this->idx_buffer_, &rng_);
return p_new_features;
#else
AssertGPUSupport();
return nullptr;
#endif // defined(XGBOOST_USE_CUDA)
}
const auto &features = p_features->HostVector();
CHECK_GT(features.size(), 0);
int n = std::max(1, static_cast<int>(colsample * features.size()));
auto p_new_features = std::make_shared<HostDeviceVector<bst_feature_t>>();
auto &new_features = *p_new_features;
if (feature_weights_.size() != 0) {
if (!feature_weights_.Empty()) {
auto const &h_features = p_features->HostVector();
std::vector<float> weights(h_features.size());
auto const &h_feature_weight = feature_weights_.ConstHostVector();
auto &weight = this->weight_buffer_.HostVector();
weight.resize(h_features.size());
for (size_t i = 0; i < h_features.size(); ++i) {
weights[i] = feature_weights_[h_features[i]];
weight[i] = h_feature_weight[h_features[i]];
}
CHECK(ctx_);
new_features.HostVector() =
WeightedSamplingWithoutReplacement(ctx_, p_features->HostVector(), weights, n);
WeightedSamplingWithoutReplacement(ctx_, p_features->HostVector(), weight, n);
} else {
new_features.Resize(features.size());
std::copy(features.begin(), features.end(), new_features.HostVector().begin());
@@ -36,5 +54,4 @@ std::shared_ptr<HostDeviceVector<bst_feature_t>> ColumnSampler::ColSample(
std::sort(new_features.HostVector().begin(), new_features.HostVector().end());
return p_new_features;
}
} // namespace common
} // namespace xgboost
} // namespace xgboost::common

106
src/common/random.cu Normal file
View File

@@ -0,0 +1,106 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#include <thrust/shuffle.h> // for shuffle
#include <memory> // for shared_ptr
#include "algorithm.cuh" // for ArgSort
#include "cuda_context.cuh" // for CUDAContext
#include "device_helpers.cuh"
#include "random.h"
#include "xgboost/base.h" // for bst_feature_t
#include "xgboost/context.h" // for Context
#include "xgboost/host_device_vector.h" // for HostDeviceVector
namespace xgboost::common::cuda_impl {
// GPU implementation for sampling without replacement, see the CPU version for references.
void WeightedSamplingWithoutReplacement(Context const *ctx, common::Span<bst_feature_t const> array,
common::Span<float const> weights,
common::Span<bst_feature_t> results,
HostDeviceVector<bst_feature_t> *sorted_idx,
GlobalRandomEngine *grng) {
CUDAContext const *cuctx = ctx->CUDACtx();
CHECK_EQ(array.size(), weights.size());
// Sampling keys
dh::caching_device_vector<float> keys(weights.size());
auto d_keys = dh::ToSpan(keys);
auto seed = (*grng)();
constexpr auto kEps = kRtEps; // avoid CUDA compilation error
thrust::for_each_n(cuctx->CTP(), thrust::make_counting_iterator(0ul), array.size(),
[=] XGBOOST_DEVICE(std::size_t i) {
thrust::default_random_engine rng;
rng.seed(seed);
rng.discard(i);
thrust::uniform_real_distribution<float> dist;
auto w = std::max(weights[i], kEps);
auto u = dist(rng);
auto k = std::log(u) / w;
d_keys[i] = k;
});
// Allocate buffer for sorted index.
auto d_idx = dh::LazyResize(ctx, sorted_idx, keys.size());
ArgSort<false>(ctx, d_keys, d_idx);
// Filter the result according to sorted index.
auto it = thrust::make_permutation_iterator(dh::tbegin(array), dh::tbegin(d_idx));
// |array| == |weights| == |keys| == |sorted_idx| >= |results|
for (auto size : {array.size(), weights.size(), keys.size()}) {
CHECK_EQ(size, d_idx.size());
}
CHECK_GE(array.size(), results.size());
thrust::copy_n(cuctx->CTP(), it, results.size(), dh::tbegin(results));
}
void SampleFeature(Context const *ctx, bst_feature_t n_features,
std::shared_ptr<HostDeviceVector<bst_feature_t>> p_features,
std::shared_ptr<HostDeviceVector<bst_feature_t>> p_new_features,
HostDeviceVector<float> const &feature_weights,
HostDeviceVector<float> *weight_buffer,
HostDeviceVector<bst_feature_t> *idx_buffer, GlobalRandomEngine *grng) {
CUDAContext const *cuctx = ctx->CUDACtx();
auto &new_features = *p_new_features;
new_features.SetDevice(ctx->Device());
p_features->SetDevice(ctx->Device());
CHECK_LE(n_features, p_features->Size());
if (!feature_weights.Empty()) {
CHECK_LE(p_features->Size(), feature_weights.Size());
idx_buffer->SetDevice(ctx->Device());
feature_weights.SetDevice(ctx->Device());
auto d_old_features = p_features->DeviceSpan();
auto d_weight_buffer = dh::LazyResize(ctx, weight_buffer, d_old_features.size());
// Filter weights according to the existing feature index.
auto d_feature_weight = feature_weights.ConstDeviceSpan();
auto it = thrust::make_permutation_iterator(dh::tcbegin(d_feature_weight),
dh::tcbegin(d_old_features));
thrust::copy_n(cuctx->CTP(), it, d_old_features.size(), dh::tbegin(d_weight_buffer));
new_features.Resize(n_features);
WeightedSamplingWithoutReplacement(ctx, d_old_features, d_weight_buffer,
new_features.DeviceSpan(), idx_buffer, grng);
} else {
new_features.Resize(p_features->Size());
new_features.Copy(*p_features);
auto d_feat = new_features.DeviceSpan();
thrust::default_random_engine rng;
rng.seed((*grng)());
thrust::shuffle(cuctx->CTP(), dh::tbegin(d_feat), dh::tend(d_feat), rng);
new_features.Resize(n_features);
}
auto d_new_features = new_features.DeviceSpan();
thrust::sort(cuctx->CTP(), dh::tbegin(d_new_features), dh::tend(d_new_features));
}
void InitFeatureSet(Context const *ctx,
std::shared_ptr<HostDeviceVector<bst_feature_t>> p_features) {
CUDAContext const *cuctx = ctx->CUDACtx();
auto d_features = p_features->DeviceSpan();
thrust::sequence(cuctx->CTP(), dh::tbegin(d_features), dh::tend(d_features), 0);
}
} // namespace xgboost::common::cuda_impl

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2015-2020 by Contributors
/**
* Copyright 2015-2020, XGBoost Contributors
* \file random.h
* \brief Utility related to random.
* \author Tianqi Chen
@@ -25,8 +25,7 @@
#include "xgboost/context.h" // Context
#include "xgboost/host_device_vector.h"
namespace xgboost {
namespace common {
namespace xgboost::common {
/*!
* \brief Define mt19937 as default type Random Engine.
*/
@@ -113,6 +112,18 @@ std::vector<T> WeightedSamplingWithoutReplacement(Context const* ctx, std::vecto
return results;
}
namespace cuda_impl {
void SampleFeature(Context const* ctx, bst_feature_t n_features,
std::shared_ptr<HostDeviceVector<bst_feature_t>> p_features,
std::shared_ptr<HostDeviceVector<bst_feature_t>> p_new_features,
HostDeviceVector<float> const& feature_weights,
HostDeviceVector<float>* weight_buffer,
HostDeviceVector<bst_feature_t>* idx_buffer, GlobalRandomEngine* grng);
void InitFeatureSet(Context const* ctx,
std::shared_ptr<HostDeviceVector<bst_feature_t>> p_features);
} // namespace cuda_impl
/**
* \class ColumnSampler
*
@@ -123,46 +134,37 @@ std::vector<T> WeightedSamplingWithoutReplacement(Context const* ctx, std::vecto
class ColumnSampler {
std::shared_ptr<HostDeviceVector<bst_feature_t>> feature_set_tree_;
std::map<int, std::shared_ptr<HostDeviceVector<bst_feature_t>>> feature_set_level_;
std::vector<float> feature_weights_;
HostDeviceVector<float> feature_weights_;
float colsample_bylevel_{1.0f};
float colsample_bytree_{1.0f};
float colsample_bynode_{1.0f};
GlobalRandomEngine rng_;
Context const* ctx_;
// Used for weighted sampling.
HostDeviceVector<bst_feature_t> idx_buffer_;
HostDeviceVector<float> weight_buffer_;
public:
std::shared_ptr<HostDeviceVector<bst_feature_t>> ColSample(
std::shared_ptr<HostDeviceVector<bst_feature_t>> p_features, float colsample);
/**
* \brief Column sampler constructor.
* \note This constructor manually sets the rng seed
* @brief Column sampler constructor.
* @note This constructor manually sets the rng seed
*/
explicit ColumnSampler(uint32_t seed) {
rng_.seed(seed);
}
explicit ColumnSampler(std::uint32_t seed) { rng_.seed(seed); }
/**
* \brief Column sampler constructor.
* \note This constructor synchronizes the RNG seed across processes.
*/
ColumnSampler() {
uint32_t seed = common::GlobalRandom()();
collective::Broadcast(&seed, sizeof(seed), 0);
rng_.seed(seed);
}
/**
* \brief Initialise this object before use.
* @brief Initialise this object before use.
*
* \param num_col
* \param colsample_bynode
* \param colsample_bylevel
* \param colsample_bytree
* \param skip_index_0 (Optional) True to skip index 0.
* @param num_col
* @param colsample_bynode Sampling rate for node.
* @param colsample_bylevel Sampling rate for tree level.
* @param colsample_bytree Sampling rate for tree.
*/
void Init(Context const* ctx, int64_t num_col, std::vector<float> feature_weights,
float colsample_bynode, float colsample_bylevel, float colsample_bytree) {
feature_weights_ = std::move(feature_weights);
feature_weights_.HostVector() = std::move(feature_weights);
colsample_bylevel_ = colsample_bylevel;
colsample_bytree_ = colsample_bytree;
colsample_bynode_ = colsample_bynode;
@@ -173,8 +175,17 @@ class ColumnSampler {
}
Reset();
feature_set_tree_->SetDevice(ctx->Device());
feature_set_tree_->Resize(num_col);
std::iota(feature_set_tree_->HostVector().begin(), feature_set_tree_->HostVector().end(), 0);
if (ctx->IsCPU()) {
std::iota(feature_set_tree_->HostVector().begin(), feature_set_tree_->HostVector().end(), 0);
} else {
#if defined(XGBOOST_USE_CUDA)
cuda_impl::InitFeatureSet(ctx, feature_set_tree_);
#else
AssertGPUSupport();
#endif
}
feature_set_tree_ = ColSample(feature_set_tree_, colsample_bytree_);
}
@@ -216,6 +227,11 @@ class ColumnSampler {
}
};
} // namespace common
} // namespace xgboost
inline auto MakeColumnSampler(Context const*) {
std::uint32_t seed = common::GlobalRandomEngine()();
collective::Broadcast(&seed, sizeof(seed), 0);
auto cs = std::make_shared<common::ColumnSampler>(seed);
return cs;
}
} // namespace xgboost::common
#endif // XGBOOST_COMMON_RANDOM_H_

View File

@@ -745,7 +745,7 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col
}
}
void MetaInfo::SynchronizeNumberOfColumns() {
void MetaInfo::SynchronizeNumberOfColumns(Context const*) {
if (IsColumnSplit()) {
collective::Allreduce<collective::Operation::kSum>(&num_col_, 1);
} else {

View File

@@ -95,7 +95,7 @@ void GetCutsFromRef(Context const* ctx, std::shared_ptr<DMatrix> ref, bst_featur
namespace {
// Synchronize feature type in case of empty DMatrix
void SyncFeatureType(std::vector<FeatureType>* p_h_ft) {
void SyncFeatureType(Context const*, std::vector<FeatureType>* p_h_ft) {
if (!collective::IsDistributed()) {
return;
}
@@ -193,7 +193,7 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p,
// From here on Info() has the correct data shape
Info().num_row_ = accumulated_rows;
Info().num_nonzero_ = nnz;
Info().SynchronizeNumberOfColumns();
Info().SynchronizeNumberOfColumns(ctx);
CHECK(std::none_of(column_sizes.cbegin(), column_sizes.cend(), [&](auto f) {
return f > accumulated_rows;
})) << "Something went wrong during iteration.";
@@ -213,9 +213,9 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p,
while (iter.Next()) {
if (!p_sketch) {
h_ft = proxy->Info().feature_types.ConstHostVector();
SyncFeatureType(&h_ft);
p_sketch.reset(new common::HostSketchContainer{ctx, p.max_bin, h_ft, column_sizes,
!proxy->Info().group_ptr_.empty()});
SyncFeatureType(ctx, &h_ft);
p_sketch = std::make_unique<common::HostSketchContainer>(ctx, p.max_bin, h_ft, column_sizes,
!proxy->Info().group_ptr_.empty());
}
HostAdapterDispatch(proxy, [&](auto const& batch) {
proxy->Info().num_nonzero_ = batch_nnz[i];
@@ -230,7 +230,7 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p,
CHECK_EQ(accumulated_rows, Info().num_row_);
CHECK(p_sketch);
p_sketch->MakeCuts(Info(), &cuts);
p_sketch->MakeCuts(ctx, Info(), &cuts);
}
if (!h_ft.empty()) {
CHECK_EQ(h_ft.size(), n_features);

View File

@@ -108,7 +108,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
sketch_containers.clear();
sketch_containers.shrink_to_fit();
final_sketch.MakeCuts(&cuts, this->info_.IsColumnSplit());
final_sketch.MakeCuts(ctx, &cuts, this->info_.IsColumnSplit());
} else {
GetCutsFromRef(ctx, ref, Info().num_col_, p, &cuts);
}
@@ -170,7 +170,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
iter.Reset();
// Synchronise worker columns
info_.SynchronizeNumberOfColumns();
info_.SynchronizeNumberOfColumns(ctx);
}
BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(Context const* ctx,

View File

@@ -283,7 +283,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
// Synchronise worker columns
info_.data_split_mode = data_split_mode;
ReindexFeatures(&ctx);
info_.SynchronizeNumberOfColumns();
info_.SynchronizeNumberOfColumns(&ctx);
if (adapter->NumRows() == kAdapterUnknownSize) {
using IteratorAdapterT =

View File

@@ -42,7 +42,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
info_.num_row_ = adapter->NumRows();
// Synchronise worker columns
info_.data_split_mode = data_split_mode;
info_.SynchronizeNumberOfColumns();
info_.SynchronizeNumberOfColumns(&ctx);
this->fmat_ctx_ = ctx;
}

View File

@@ -97,7 +97,7 @@ SparsePageDMatrix::SparsePageDMatrix(DataIterHandle iter_handle, DMatrixHandle p
this->info_.num_col_ = n_features;
this->info_.num_nonzero_ = nnz;
info_.SynchronizeNumberOfColumns();
info_.SynchronizeNumberOfColumns(&ctx);
CHECK_NE(info_.num_col_, 0);
fmat_ctx_ = ctx;

View File

@@ -113,13 +113,13 @@ void GBTree::Configure(Args const& cfg) {
}
#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
#if defined(XGBOOST_USE_ONEAPI)
if (!oneapi_predictor_) {
oneapi_predictor_ =
std::unique_ptr<Predictor>(Predictor::Create("oneapi_predictor", this->ctx_));
#if defined(XGBOOST_USE_SYCL)
if (!sycl_predictor_) {
sycl_predictor_ =
std::unique_ptr<Predictor>(Predictor::Create("sycl_predictor", this->ctx_));
}
oneapi_predictor_->Configure(cfg);
#endif // defined(XGBOOST_USE_ONEAPI)
sycl_predictor_->Configure(cfg);
#endif // defined(XGBOOST_USE_SYCL)
// `updater` parameter was manually specified
specified_updater_ =
@@ -553,6 +553,11 @@ void GBTree::InplacePredict(std::shared_ptr<DMatrix> p_m, float missing,
},
[&, begin = tree_begin, end = tree_end] {
return this->gpu_predictor_->InplacePredict(p_m, model_, missing, out_preds, begin, end);
#if defined(XGBOOST_USE_SYCL)
},
[&, begin = tree_begin, end = tree_end] {
return this->sycl_predictor_->InplacePredict(p_m, model_, missing, out_preds, begin, end);
#endif // defined(XGBOOST_USE_SYCL)
});
if (!known_type) {
auto proxy = std::dynamic_pointer_cast<data::DMatrixProxy>(p_m);
@@ -568,10 +573,16 @@ void GBTree::InplacePredict(std::shared_ptr<DMatrix> p_m, float missing,
if (f_dmat && !f_dmat->SingleColBlock()) {
if (ctx_->IsCPU()) {
return cpu_predictor_;
} else {
} else if (ctx_->IsCUDA()) {
common::AssertGPUSupport();
CHECK(gpu_predictor_);
return gpu_predictor_;
} else {
#if defined(XGBOOST_USE_SYCL)
common::AssertSYCLSupport();
CHECK(sycl_predictor_);
return sycl_predictor_;
#endif // defined(XGBOOST_USE_SYCL)
}
}
@@ -606,10 +617,16 @@ void GBTree::InplacePredict(std::shared_ptr<DMatrix> p_m, float missing,
if (ctx_->IsCPU()) {
return cpu_predictor_;
} else {
} else if (ctx_->IsCUDA()) {
common::AssertGPUSupport();
CHECK(gpu_predictor_);
return gpu_predictor_;
} else {
#if defined(XGBOOST_USE_SYCL)
common::AssertSYCLSupport();
CHECK(sycl_predictor_);
return sycl_predictor_;
#endif // defined(XGBOOST_USE_SYCL)
}
return cpu_predictor_;
@@ -814,6 +831,11 @@ class Dart : public GBTree {
},
[&] {
return gpu_predictor_->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1);
#if defined(XGBOOST_USE_SYCL)
},
[&] {
return sycl_predictor_->InplacePredict(p_fmat, model_, missing, &predts, i, i + 1);
#endif // defined(XGBOOST_USE_SYCL)
});
CHECK(success) << msg;
};
@@ -830,6 +852,12 @@ class Dart : public GBTree {
[&] {
this->gpu_predictor_->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
model_);
#if defined(XGBOOST_USE_SYCL)
},
[&] {
this->sycl_predictor_->InitOutPredictions(p_fmat->Info(), &p_out_preds->predictions,
model_);
#endif // defined(XGBOOST_USE_SYCL)
});
}
// Multiple the tree weight

View File

@@ -349,9 +349,9 @@ class GBTree : public GradientBooster {
// Predictors
std::unique_ptr<Predictor> cpu_predictor_;
std::unique_ptr<Predictor> gpu_predictor_{nullptr};
#if defined(XGBOOST_USE_ONEAPI)
std::unique_ptr<Predictor> oneapi_predictor_;
#endif // defined(XGBOOST_USE_ONEAPI)
#if defined(XGBOOST_USE_SYCL)
std::unique_ptr<Predictor> sycl_predictor_;
#endif // defined(XGBOOST_USE_SYCL)
common::Monitor monitor_;
};

View File

@@ -209,7 +209,7 @@ struct LearnerModelParamLegacy : public dmlc::Parameter<LearnerModelParamLegacy>
return dmlc::Parameter<LearnerModelParamLegacy>::UpdateAllowUnknown(kwargs);
}
// sanity check
void Validate() {
void Validate(Context const*) {
if (!collective::IsDistributed()) {
return;
}
@@ -434,7 +434,7 @@ class LearnerConfiguration : public Learner {
}
// Update the shared model parameter
this->ConfigureModelParamWithoutBaseScore();
mparam_.Validate();
mparam_.Validate(&ctx_);
}
CHECK(!std::isnan(mparam_.base_score));
CHECK(!std::isinf(mparam_.base_score));

View File

@@ -360,7 +360,7 @@ class EvalROCAUC : public EvalAUC<EvalROCAUC> {
common::OptionalWeights{info.weights_.ConstHostSpan()});
} else {
std::tie(fp, tp, auc) =
GPUBinaryROCAUC(predts.ConstDeviceSpan(), info, ctx_->Device(), &this->d_cache_);
GPUBinaryROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_);
}
return std::make_tuple(fp, tp, auc);
}
@@ -376,8 +376,9 @@ XGBOOST_REGISTER_METRIC(EvalAUC, "auc")
.set_body([](const char*) { return new EvalROCAUC(); });
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
std::tuple<double, double, double> GPUBinaryROCAUC(common::Span<float const>, MetaInfo const &,
DeviceOrd, std::shared_ptr<DeviceAUCCache> *) {
std::tuple<double, double, double> GPUBinaryROCAUC(Context const *, common::Span<float const>,
MetaInfo const &,
std::shared_ptr<DeviceAUCCache> *) {
common::AssertGPUSupport();
return {};
}
@@ -409,8 +410,7 @@ class EvalPRAUC : public EvalAUC<EvalPRAUC> {
BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0),
common::OptionalWeights{info.weights_.ConstHostSpan()});
} else {
std::tie(pr, re, auc) =
GPUBinaryPRAUC(predts.ConstDeviceSpan(), info, ctx_->Device(), &this->d_cache_);
std::tie(pr, re, auc) = GPUBinaryPRAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_);
}
return std::make_tuple(pr, re, auc);
}
@@ -453,8 +453,9 @@ XGBOOST_REGISTER_METRIC(AUCPR, "aucpr")
.set_body([](char const *) { return new EvalPRAUC{}; });
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP)
std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const>, MetaInfo const &,
DeviceOrd, std::shared_ptr<DeviceAUCCache> *) {
std::tuple<double, double, double> GPUBinaryPRAUC(Context const *, common::Span<float const>,
MetaInfo const &,
std::shared_ptr<DeviceAUCCache> *) {
common::AssertGPUSupport();
return {};
}

View File

@@ -89,13 +89,14 @@ void InitCacheOnce(common::Span<float const> predts, std::shared_ptr<DeviceAUCCa
* - Reduce the scan array into 1 AUC value.
*/
template <typename Fn>
std::tuple<double, double, double>
GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
DeviceOrd device, common::Span<size_t const> d_sorted_idx,
Fn area_fn, std::shared_ptr<DeviceAUCCache> cache) {
auto labels = info.labels.View(device);
std::tuple<double, double, double> GPUBinaryAUC(Context const *ctx,
common::Span<float const> predts,
MetaInfo const &info,
common::Span<size_t const> d_sorted_idx, Fn area_fn,
std::shared_ptr<DeviceAUCCache> cache) {
auto labels = info.labels.View(ctx->Device());
auto weights = info.weights_.ConstDeviceSpan();
dh::safe_cuda(cudaSetDevice(device.ordinal));
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
CHECK_NE(labels.Size(), 0);
CHECK_EQ(labels.Size(), predts.size());
@@ -121,7 +122,7 @@ GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
dh::XGBDeviceAllocator<char> alloc;
auto d_unique_idx = dh::ToSpan(cache->unique_idx);
dh::Iota(d_unique_idx);
dh::Iota(d_unique_idx, ctx->CUDACtx()->Stream());
auto uni_key = dh::MakeTransformIterator<float>(
thrust::make_counting_iterator(0),
@@ -177,8 +178,9 @@ GPUBinaryAUC(common::Span<float const> predts, MetaInfo const &info,
return std::make_tuple(last.first, last.second, auc);
}
std::tuple<double, double, double> GPUBinaryROCAUC(common::Span<float const> predts,
MetaInfo const &info, DeviceOrd device,
std::tuple<double, double, double> GPUBinaryROCAUC(Context const *ctx,
common::Span<float const> predts,
MetaInfo const &info,
std::shared_ptr<DeviceAUCCache> *p_cache) {
auto &cache = *p_cache;
InitCacheOnce<false>(predts, p_cache);
@@ -187,10 +189,10 @@ std::tuple<double, double, double> GPUBinaryROCAUC(common::Span<float const> pre
* Create sorted index for each class
*/
auto d_sorted_idx = dh::ToSpan(cache->sorted_idx);
dh::ArgSort<false>(predts, d_sorted_idx);
common::ArgSort<false>(ctx, predts, d_sorted_idx);
// Create lambda to avoid pass function pointer.
return GPUBinaryAUC(
predts, info, device, d_sorted_idx,
ctx, predts, info, d_sorted_idx,
[] XGBOOST_DEVICE(double x0, double x1, double y0, double y1) -> double {
return TrapezoidArea(x0, x1, y0, y1);
},
@@ -209,9 +211,9 @@ void Transpose(common::Span<float const> in, common::Span<float> out, size_t m,
});
}
double ScaleClasses(common::Span<double> results, common::Span<double> local_area,
common::Span<double> tp, common::Span<double> auc, size_t n_classes) {
dh::XGBDeviceAllocator<char> alloc;
double ScaleClasses(Context const *ctx, common::Span<double> results,
common::Span<double> local_area, common::Span<double> tp,
common::Span<double> auc, size_t n_classes) {
if (collective::IsDistributed()) {
int32_t device = dh::CurrentDevice();
CHECK_EQ(dh::CudaGetPointerDevice(results.data()), device);
@@ -229,9 +231,8 @@ double ScaleClasses(common::Span<double> results, common::Span<double> local_are
double auc_sum;
thrust::tie(auc_sum, tp_sum) =
thrust::reduce(thrust::cuda::par(alloc), reduce_in, reduce_in + n_classes,
Pair{0.0, 0.0}, PairPlus<double, double>{});
thrust::reduce(ctx->CUDACtx()->CTP(), reduce_in, reduce_in + n_classes, Pair{0.0, 0.0},
PairPlus<double, double>{});
if (tp_sum != 0 && !std::isnan(auc_sum)) {
auc_sum /= tp_sum;
} else {
@@ -322,10 +323,10 @@ void SegmentedReduceAUC(common::Span<size_t const> d_unique_idx,
* up each class in all kernels.
*/
template <bool scale, typename Fn>
double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device,
double GPUMultiClassAUCOVR(Context const *ctx, MetaInfo const &info,
common::Span<uint32_t> d_class_ptr, size_t n_classes,
std::shared_ptr<DeviceAUCCache> cache, Fn area_fn) {
dh::safe_cuda(cudaSetDevice(device.ordinal));
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
/**
* Sorted idx
*/
@@ -333,7 +334,7 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device,
// Index is sorted within class.
auto d_sorted_idx = dh::ToSpan(cache->sorted_idx);
auto labels = info.labels.View(device);
auto labels = info.labels.View(ctx->Device());
auto weights = info.weights_.ConstDeviceSpan();
size_t n_samples = labels.Shape(0);
@@ -341,12 +342,11 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device,
if (n_samples == 0) {
dh::TemporaryArray<double> resutls(n_classes * 4, 0.0f);
auto d_results = dh::ToSpan(resutls);
dh::LaunchN(n_classes * 4,
[=] XGBOOST_DEVICE(size_t i) { d_results[i] = 0.0f; });
dh::LaunchN(n_classes * 4, [=] XGBOOST_DEVICE(size_t i) { d_results[i] = 0.0f; });
auto local_area = d_results.subspan(0, n_classes);
auto tp = d_results.subspan(2 * n_classes, n_classes);
auto auc = d_results.subspan(3 * n_classes, n_classes);
return ScaleClasses(d_results, local_area, tp, auc, n_classes);
return ScaleClasses(ctx, d_results, local_area, tp, auc, n_classes);
}
/**
@@ -375,7 +375,7 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device,
*/
dh::XGBDeviceAllocator<char> alloc;
auto d_unique_idx = dh::ToSpan(cache->unique_idx);
dh::Iota(d_unique_idx);
dh::Iota(d_unique_idx, ctx->CUDACtx()->Stream());
auto uni_key = dh::MakeTransformIterator<thrust::pair<uint32_t, float>>(
thrust::make_counting_iterator(0), [=] XGBOOST_DEVICE(size_t i) {
uint32_t class_id = i / n_samples;
@@ -452,7 +452,7 @@ double GPUMultiClassAUCOVR(MetaInfo const &info, DeviceOrd device,
tp[c] = 1.0f;
}
});
return ScaleClasses(d_results, local_area, tp, auc, n_classes);
return ScaleClasses(ctx, d_results, local_area, tp, auc, n_classes);
}
void MultiClassSortedIdx(Context const *ctx, common::Span<float const> predts,
@@ -487,8 +487,7 @@ double GPUMultiClassROCAUC(Context const *ctx, common::Span<float const> predts,
size_t /*class_id*/) {
return TrapezoidArea(fp_prev, fp, tp_prev, tp);
};
return GPUMultiClassAUCOVR<true>(info, ctx->Device(), dh::ToSpan(class_ptr), n_classes, cache,
fn);
return GPUMultiClassAUCOVR<true>(ctx, info, dh::ToSpan(class_ptr), n_classes, cache, fn);
}
namespace {
@@ -623,8 +622,9 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
return std::make_pair(auc, n_valid);
}
std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const> predts,
MetaInfo const &info, DeviceOrd device,
std::tuple<double, double, double> GPUBinaryPRAUC(Context const *ctx,
common::Span<float const> predts,
MetaInfo const &info,
std::shared_ptr<DeviceAUCCache> *p_cache) {
auto& cache = *p_cache;
InitCacheOnce<false>(predts, p_cache);
@@ -633,9 +633,9 @@ std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const> pred
* Create sorted index for each class
*/
auto d_sorted_idx = dh::ToSpan(cache->sorted_idx);
dh::ArgSort<false>(predts, d_sorted_idx);
common::ArgSort<false>(ctx, predts, d_sorted_idx);
auto labels = info.labels.View(device);
auto labels = info.labels.View(ctx->Device());
auto d_weights = info.weights_.ConstDeviceSpan();
auto get_weight = common::OptionalWeights{d_weights};
auto it = dh::MakeTransformIterator<Pair>(
@@ -660,7 +660,7 @@ std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const> pred
return detail::CalcDeltaPRAUC(fp_prev, fp, tp_prev, tp, total_pos);
};
double fp, tp, auc;
std::tie(fp, tp, auc) = GPUBinaryAUC(predts, info, device, d_sorted_idx, fn, cache);
std::tie(fp, tp, auc) = GPUBinaryAUC(ctx, predts, info, d_sorted_idx, fn, cache);
return std::make_tuple(1.0, 1.0, auc);
}
@@ -717,20 +717,21 @@ double GPUMultiClassPRAUC(Context const *ctx, common::Span<float const> predts,
return detail::CalcDeltaPRAUC(fp_prev, fp, tp_prev, tp,
d_totals[class_id].first);
};
return GPUMultiClassAUCOVR<false>(info, ctx->Device(), d_class_ptr, n_classes, cache, fn);
return GPUMultiClassAUCOVR<false>(ctx, info, d_class_ptr, n_classes, cache, fn);
}
template <typename Fn>
std::pair<double, uint32_t>
GPURankingPRAUCImpl(common::Span<float const> predts, MetaInfo const &info,
common::Span<uint32_t> d_group_ptr, DeviceOrd device,
std::shared_ptr<DeviceAUCCache> cache, Fn area_fn) {
std::pair<double, uint32_t> GPURankingPRAUCImpl(Context const *ctx,
common::Span<float const> predts,
MetaInfo const &info,
common::Span<uint32_t> d_group_ptr,
std::shared_ptr<DeviceAUCCache> cache, Fn area_fn) {
/**
* Sorted idx
*/
auto d_sorted_idx = dh::ToSpan(cache->sorted_idx);
auto labels = info.labels.View(device);
auto labels = info.labels.View(ctx->Device());
auto weights = info.weights_.ConstDeviceSpan();
uint32_t n_groups = static_cast<uint32_t>(info.group_ptr_.size() - 1);
@@ -761,7 +762,7 @@ GPURankingPRAUCImpl(common::Span<float const> predts, MetaInfo const &info,
*/
dh::XGBDeviceAllocator<char> alloc;
auto d_unique_idx = dh::ToSpan(cache->unique_idx);
dh::Iota(d_unique_idx);
dh::Iota(d_unique_idx, ctx->CUDACtx()->Stream());
auto uni_key = dh::MakeTransformIterator<thrust::pair<uint32_t, float>>(
thrust::make_counting_iterator(0), [=] XGBOOST_DEVICE(size_t i) {
auto idx = d_sorted_idx[i];
@@ -910,7 +911,7 @@ std::pair<double, std::uint32_t> GPURankingPRAUC(Context const *ctx,
return detail::CalcDeltaPRAUC(fp_prev, fp, tp_prev, tp,
d_totals[group_id].first);
};
return GPURankingPRAUCImpl(predts, info, d_group_ptr, ctx->Device(), cache, fn);
return GPURankingPRAUCImpl(ctx, predts, info, d_group_ptr, cache, fn);
}
} // namespace metric
} // namespace xgboost

View File

@@ -1,5 +1,5 @@
/*!
* Copyright 2021 by XGBoost Contributors
/**
* Copyright 2021-2023, XGBoost Contributors
*/
#ifndef XGBOOST_METRIC_AUC_H_
#define XGBOOST_METRIC_AUC_H_
@@ -18,8 +18,7 @@
#include "xgboost/metric.h"
#include "xgboost/span.h"
namespace xgboost {
namespace metric {
namespace xgboost::metric {
/***********
* ROC AUC *
***********/
@@ -29,8 +28,9 @@ XGBOOST_DEVICE inline double TrapezoidArea(double x0, double x1, double y0, doub
struct DeviceAUCCache;
std::tuple<double, double, double> GPUBinaryROCAUC(common::Span<float const> predts,
MetaInfo const &info, DeviceOrd,
std::tuple<double, double, double> GPUBinaryROCAUC(Context const *ctx,
common::Span<float const> predts,
MetaInfo const &info,
std::shared_ptr<DeviceAUCCache> *p_cache);
double GPUMultiClassROCAUC(Context const *ctx, common::Span<float const> predts,
@@ -44,8 +44,9 @@ std::pair<double, std::uint32_t> GPURankingAUC(Context const *ctx, common::Span<
/**********
* PR AUC *
**********/
std::tuple<double, double, double> GPUBinaryPRAUC(common::Span<float const> predts,
MetaInfo const &info, DeviceOrd,
std::tuple<double, double, double> GPUBinaryPRAUC(Context const *ctx,
common::Span<float const> predts,
MetaInfo const &info,
std::shared_ptr<DeviceAUCCache> *p_cache);
double GPUMultiClassPRAUC(Context const *ctx, common::Span<float const> predts,
@@ -111,6 +112,5 @@ struct PRAUCLabelInvalid {
inline void InvalidLabels() {
LOG(FATAL) << "PR-AUC supports only binary relevance for learning to rank.";
}
} // namespace metric
} // namespace xgboost
} // namespace xgboost::metric
#endif // XGBOOST_METRIC_AUC_H_

View File

@@ -215,7 +215,7 @@ struct EvalError {
has_param_ = false;
}
}
const char *Name() const {
[[nodiscard]] const char *Name() const {
static thread_local std::string name;
if (has_param_) {
std::ostringstream os;
@@ -228,7 +228,7 @@ struct EvalError {
}
}
XGBOOST_DEVICE bst_float EvalRow(bst_float label, bst_float pred) const {
[[nodiscard]] XGBOOST_DEVICE bst_float EvalRow(bst_float label, bst_float pred) const {
// assume label is in [0,1]
return pred > threshold_ ? 1.0f - label : label;
}
@@ -370,7 +370,7 @@ struct EvalEWiseBase : public MetricNoCache {
return Policy::GetFinal(dat[0], dat[1]);
}
const char* Name() const override { return policy_.Name(); }
[[nodiscard]] const char* Name() const override { return policy_.Name(); }
private:
Policy policy_;

View File

@@ -162,7 +162,7 @@ struct EvalRank : public MetricNoCache, public EvalRankConfig {
return collective::GlobalRatio(info, sum_metric, static_cast<double>(ngroups));
}
const char* Name() const override {
[[nodiscard]] const char* Name() const override {
return name.c_str();
}
@@ -294,7 +294,7 @@ class EvalRankWithCache : public Metric {
};
namespace {
double Finalize(MetaInfo const& info, double score, double sw) {
double Finalize(Context const*, MetaInfo const& info, double score, double sw) {
std::array<double, 2> dat{score, sw};
collective::GlobalSum(info, &dat);
std::tie(score, sw) = std::tuple_cat(dat);
@@ -323,7 +323,7 @@ class EvalPrecision : public EvalRankWithCache<ltr::PreCache> {
if (ctx_->IsCUDA()) {
auto pre = cuda_impl::PreScore(ctx_, info, predt, p_cache);
return Finalize(info, pre.Residue(), pre.Weights());
return Finalize(ctx_, info, pre.Residue(), pre.Weights());
}
auto gptr = p_cache->DataGroupPtr(ctx_);
@@ -352,7 +352,7 @@ class EvalPrecision : public EvalRankWithCache<ltr::PreCache> {
}
auto sum = std::accumulate(pre.cbegin(), pre.cend(), 0.0);
return Finalize(info, sum, sw);
return Finalize(ctx_, info, sum, sw);
}
};
@@ -369,7 +369,7 @@ class EvalNDCG : public EvalRankWithCache<ltr::NDCGCache> {
std::shared_ptr<ltr::NDCGCache> p_cache) override {
if (ctx_->IsCUDA()) {
auto ndcg = cuda_impl::NDCGScore(ctx_, info, preds, minus_, p_cache);
return Finalize(info, ndcg.Residue(), ndcg.Weights());
return Finalize(ctx_, info, ndcg.Residue(), ndcg.Weights());
}
// group local ndcg
@@ -415,7 +415,7 @@ class EvalNDCG : public EvalRankWithCache<ltr::NDCGCache> {
sum_w = std::accumulate(weights.weights.cbegin(), weights.weights.cend(), 0.0);
}
auto ndcg = std::accumulate(linalg::cbegin(ndcg_gloc), linalg::cend(ndcg_gloc), 0.0);
return Finalize(info, ndcg, sum_w);
return Finalize(ctx_, info, ndcg, sum_w);
}
};
@@ -427,7 +427,7 @@ class EvalMAPScore : public EvalRankWithCache<ltr::MAPCache> {
std::shared_ptr<ltr::MAPCache> p_cache) override {
if (ctx_->IsCUDA()) {
auto map = cuda_impl::MAPScore(ctx_, info, predt, minus_, p_cache);
return Finalize(info, map.Residue(), map.Weights());
return Finalize(ctx_, info, map.Residue(), map.Weights());
}
auto gptr = p_cache->DataGroupPtr(ctx_);
@@ -469,7 +469,7 @@ class EvalMAPScore : public EvalRankWithCache<ltr::MAPCache> {
sw += weight[i];
}
auto sum = std::accumulate(map_gloc.cbegin(), map_gloc.cend(), 0.0);
return Finalize(info, sum, sw);
return Finalize(ctx_, info, sum, sw);
}
};

View File

@@ -218,7 +218,7 @@ struct EvalEWiseSurvivalBase : public MetricNoCache {
return Policy::GetFinal(dat[0], dat[1]);
}
const char* Name() const override {
[[nodiscard]] const char* Name() const override {
return policy_.Name();
}

View File

@@ -18,9 +18,7 @@
#include "adaptive.h"
#include "xgboost/context.h"
namespace xgboost {
namespace obj {
namespace detail {
namespace xgboost::obj::detail {
void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> position,
dh::device_vector<size_t>* p_ridx, HostDeviceVector<size_t>* p_nptr,
HostDeviceVector<bst_node_t>* p_nidx, RegTree const& tree) {
@@ -34,7 +32,7 @@ void EncodeTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
position.size_bytes(), cudaMemcpyDeviceToDevice, cuctx->Stream()));
p_ridx->resize(position.size());
dh::Iota(dh::ToSpan(*p_ridx));
dh::Iota(dh::ToSpan(*p_ridx), cuctx->Stream());
// sort row index according to node index
thrust::stable_sort_by_key(cuctx->TP(), sorted_position.begin(),
sorted_position.begin() + n_samples, p_ridx->begin());
@@ -197,6 +195,4 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span<bst_node_t const> pos
});
UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), info, learning_rate, p_tree);
}
} // namespace detail
} // namespace obj
} // namespace xgboost
} // namespace xgboost::obj::detail

View File

@@ -13,9 +13,7 @@
#include "xgboost/logging.h"
#include "xgboost/task.h" // ObjInfo
namespace xgboost {
namespace obj {
// common regressions
namespace xgboost::obj {
// linear regression
struct LinearSquareLoss {
XGBOOST_DEVICE static bst_float PredTransform(bst_float x) { return x; }
@@ -106,7 +104,21 @@ struct LogisticRaw : public LogisticRegression {
static ObjInfo Info() { return ObjInfo::kRegression; }
};
} // namespace obj
} // namespace xgboost
// gamma deviance loss.
class GammaDeviance {
public:
XGBOOST_DEVICE static float PredTransform(float x) { return std::exp(x); }
XGBOOST_DEVICE static float ProbToMargin(float x) { return std::log(x); }
XGBOOST_DEVICE static float FirstOrderGradient(float p, float y) {
return 1.0f - y / p;
}
XGBOOST_DEVICE static float SecondOrderGradient(float p, float y) { return y / p; }
static ObjInfo Info() { return ObjInfo::kRegression; }
static const char* Name() { return "reg:gamma"; }
static const char* DefaultEvalMetric() { return "gamma-deviance"; }
XGBOOST_DEVICE static bool CheckLabel(float x) { return x > 0.0f; }
static const char* LabelErrorMsg() { return "label must be positive for gamma regression."; }
};
} // namespace xgboost::obj
#endif // XGBOOST_OBJECTIVE_REGRESSION_LOSS_H_

View File

@@ -221,6 +221,10 @@ XGBOOST_REGISTER_OBJECTIVE(LogisticRaw, LogisticRaw::Name())
"before logistic transformation.")
.set_body([]() { return new RegLossObj<LogisticRaw>(); });
XGBOOST_REGISTER_OBJECTIVE(GammaRegression, GammaDeviance::Name())
.describe("Gamma regression using the gamma deviance loss with log link.")
.set_body([]() { return new RegLossObj<GammaDeviance>(); });
// Deprecated functions
XGBOOST_REGISTER_OBJECTIVE(LinearRegression, "reg:linear")
.describe("Regression with squared error.")
@@ -501,87 +505,6 @@ XGBOOST_REGISTER_OBJECTIVE(CoxRegression, "survival:cox")
.describe("Cox regression for censored survival data (negative labels are considered censored).")
.set_body([]() { return new CoxRegression(); });
// gamma regression
class GammaRegression : public FitIntercept {
public:
void Configure(Args const&) override {}
[[nodiscard]] ObjInfo Task() const override { return ObjInfo::kRegression; }
void GetGradient(const HostDeviceVector<bst_float>& preds, const MetaInfo& info, std::int32_t,
linalg::Matrix<GradientPair>* out_gpair) override {
CHECK_NE(info.labels.Size(), 0U) << "label set cannot be empty";
CHECK_EQ(preds.Size(), info.labels.Size()) << "labels are not correctly provided";
const size_t ndata = preds.Size();
auto device = ctx_->Device();
out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(info.num_row_, this->Targets(info));
label_correct_.Resize(1);
label_correct_.Fill(1);
const bool is_null_weight = info.weights_.Size() == 0;
if (!is_null_weight) {
CHECK_EQ(info.weights_.Size(), ndata)
<< "Number of weights should be equal to number of data points.";
}
common::Transform<>::Init(
[=] XGBOOST_DEVICE(size_t _idx,
common::Span<int> _label_correct,
common::Span<GradientPair> _out_gpair,
common::Span<const bst_float> _preds,
common::Span<const bst_float> _labels,
common::Span<const bst_float> _weights) {
bst_float p = _preds[_idx];
bst_float w = is_null_weight ? 1.0f : _weights[_idx];
bst_float y = _labels[_idx];
if (y <= 0.0f) {
_label_correct[0] = 0;
}
_out_gpair[_idx] = GradientPair((1 - y / expf(p)) * w, y / expf(p) * w);
},
common::Range{0, static_cast<int64_t>(ndata)}, this->ctx_->Threads(), device).Eval(
&label_correct_, out_gpair->Data(), &preds, info.labels.Data(), &info.weights_);
// copy "label correct" flags back to host
std::vector<int>& label_correct_h = label_correct_.HostVector();
for (auto const flag : label_correct_h) {
if (flag == 0) {
LOG(FATAL) << "GammaRegression: label must be positive.";
}
}
}
void PredTransform(HostDeviceVector<bst_float> *io_preds) const override {
common::Transform<>::Init(
[] XGBOOST_DEVICE(size_t _idx, common::Span<bst_float> _preds) {
_preds[_idx] = expf(_preds[_idx]);
},
common::Range{0, static_cast<int64_t>(io_preds->Size())}, this->ctx_->Threads(),
io_preds->Device())
.Eval(io_preds);
}
void EvalTransform(HostDeviceVector<bst_float> *io_preds) override {
PredTransform(io_preds);
}
[[nodiscard]] float ProbToMargin(bst_float base_score) const override {
return std::log(base_score);
}
[[nodiscard]] const char* DefaultEvalMetric() const override {
return "gamma-nloglik";
}
void SaveConfig(Json* p_out) const override {
auto& out = *p_out;
out["name"] = String("reg:gamma");
}
void LoadConfig(Json const&) override {}
private:
HostDeviceVector<int> label_correct_;
};
// register the objective functions
XGBOOST_REGISTER_OBJECTIVE(GammaRegression, "reg:gamma")
.describe("Gamma regression for severity data.")
.set_body([]() { return new GammaRegression(); });
// declare parameter
struct TweedieRegressionParam : public XGBoostParameter<TweedieRegressionParam> {

View File

@@ -189,7 +189,7 @@ struct SparsePageView {
explicit SparsePageView(SparsePage const *p) : base_rowid{p->base_rowid} { view = p->GetView(); }
SparsePage::Inst operator[](size_t i) { return view[i]; }
size_t Size() const { return view.Size(); }
[[nodiscard]] size_t Size() const { return view.Size(); }
};
struct SingleInstanceView {
@@ -250,7 +250,7 @@ struct GHistIndexMatrixView {
}
return ret;
}
size_t Size() const { return page_.Size(); }
[[nodiscard]] size_t Size() const { return page_.Size(); }
};
template <typename Adapter>
@@ -290,7 +290,7 @@ class AdapterView {
return ret;
}
size_t Size() const { return adapter_->NumRows(); }
[[nodiscard]] size_t Size() const { return adapter_->NumRows(); }
bst_row_t const static base_rowid = 0; // NOLINT
};
@@ -408,31 +408,33 @@ class ColumnSplitHelper {
ColumnSplitHelper(ColumnSplitHelper &&) noexcept = delete;
ColumnSplitHelper &operator=(ColumnSplitHelper &&) noexcept = delete;
void PredictDMatrix(DMatrix *p_fmat, std::vector<bst_float> *out_preds) {
void PredictDMatrix(Context const *ctx, DMatrix *p_fmat, std::vector<bst_float> *out_preds) {
CHECK(xgboost::collective::IsDistributed())
<< "column-split prediction is only supported for distributed training";
for (auto const &batch : p_fmat->GetBatches<SparsePage>()) {
CHECK_EQ(out_preds->size(),
p_fmat->Info().num_row_ * model_.learner_model_param->num_output_group);
PredictBatchKernel<SparsePageView, kBlockOfRowsSize>(SparsePageView{&batch}, out_preds);
PredictBatchKernel<SparsePageView, kBlockOfRowsSize>(ctx, SparsePageView{&batch}, out_preds);
}
}
void PredictInstance(SparsePage::Inst const &inst, std::vector<bst_float> *out_preds) {
void PredictInstance(Context const *ctx, SparsePage::Inst const &inst,
std::vector<bst_float> *out_preds) {
CHECK(xgboost::collective::IsDistributed())
<< "column-split prediction is only supported for distributed training";
PredictBatchKernel<SingleInstanceView, 1>(SingleInstanceView{inst}, out_preds);
PredictBatchKernel<SingleInstanceView, 1>(ctx, SingleInstanceView{inst}, out_preds);
}
void PredictLeaf(DMatrix *p_fmat, std::vector<bst_float> *out_preds) {
void PredictLeaf(Context const* ctx, DMatrix *p_fmat, std::vector<bst_float> *out_preds) {
CHECK(xgboost::collective::IsDistributed())
<< "column-split prediction is only supported for distributed training";
for (auto const &batch : p_fmat->GetBatches<SparsePage>()) {
CHECK_EQ(out_preds->size(), p_fmat->Info().num_row_ * (tree_end_ - tree_begin_));
PredictBatchKernel<SparsePageView, kBlockOfRowsSize, true>(SparsePageView{&batch}, out_preds);
PredictBatchKernel<SparsePageView, kBlockOfRowsSize, true>(ctx, SparsePageView{&batch},
out_preds);
}
}
@@ -453,12 +455,13 @@ class ColumnSplitHelper {
std::fill(missing_storage_.begin(), missing_storage_.end(), 0);
}
std::size_t BitIndex(std::size_t tree_id, std::size_t row_id, std::size_t node_id) const {
[[nodiscard]] std::size_t BitIndex(std::size_t tree_id, std::size_t row_id,
std::size_t node_id) const {
size_t tree_index = tree_id - tree_begin_;
return tree_offsets_[tree_index] * n_rows_ + row_id * tree_sizes_[tree_index] + node_id;
}
void AllreduceBitVectors() {
void AllreduceBitVectors(Context const*) {
collective::Allreduce<collective::Operation::kBitwiseOR>(decision_storage_.data(),
decision_storage_.size());
collective::Allreduce<collective::Operation::kBitwiseAND>(missing_storage_.data(),
@@ -547,7 +550,7 @@ class ColumnSplitHelper {
}
template <typename DataView, size_t block_of_rows_size, bool predict_leaf = false>
void PredictBatchKernel(DataView batch, std::vector<bst_float> *out_preds) {
void PredictBatchKernel(Context const* ctx, DataView batch, std::vector<bst_float> *out_preds) {
auto const num_group = model_.learner_model_param->num_output_group;
// parallel over local batch
@@ -568,7 +571,7 @@ class ColumnSplitHelper {
FVecDrop(block_size, fvec_offset, &feat_vecs_);
});
AllreduceBitVectors();
AllreduceBitVectors(ctx);
// auto block_id has the same type as `n_blocks`.
common::ParallelFor(n_blocks, n_threads_, [&](auto block_id) {
@@ -646,7 +649,7 @@ class CPUPredictor : public Predictor {
<< "Predict DMatrix with column split" << MTNotImplemented();
ColumnSplitHelper helper(this->ctx_->Threads(), model, tree_begin, tree_end);
helper.PredictDMatrix(p_fmat, out_preds);
helper.PredictDMatrix(ctx_, p_fmat, out_preds);
return;
}
@@ -779,7 +782,7 @@ class CPUPredictor : public Predictor {
<< "Predict instance with column split" << MTNotImplemented();
ColumnSplitHelper helper(this->ctx_->Threads(), model, 0, ntree_limit);
helper.PredictInstance(inst, out_preds);
helper.PredictInstance(ctx_, inst, out_preds);
return;
}
@@ -811,7 +814,7 @@ class CPUPredictor : public Predictor {
<< "Predict leaf with column split" << MTNotImplemented();
ColumnSplitHelper helper(n_threads, model, 0, ntree_limit);
helper.PredictLeaf(p_fmat, &preds);
helper.PredictLeaf(ctx_, p_fmat, &preds);
return;
}

View File

@@ -62,9 +62,7 @@ struct TreeView {
cats.node_ptr = tree_cat_ptrs;
}
__device__ bool HasCategoricalSplit() const {
return !cats.categories.empty();
}
[[nodiscard]] __device__ bool HasCategoricalSplit() const { return !cats.categories.empty(); }
};
struct SparsePageView {
@@ -77,7 +75,7 @@ struct SparsePageView {
common::Span<const bst_row_t> row_ptr,
bst_feature_t num_features)
: d_data{data}, d_row_ptr{row_ptr}, num_features(num_features) {}
__device__ float GetElement(size_t ridx, size_t fidx) const {
[[nodiscard]] __device__ float GetElement(size_t ridx, size_t fidx) const {
// Binary search
auto begin_ptr = d_data.begin() + d_row_ptr[ridx];
auto end_ptr = d_data.begin() + d_row_ptr[ridx + 1];
@@ -105,8 +103,8 @@ struct SparsePageView {
// Value is missing
return nanf("");
}
XGBOOST_DEVICE size_t NumRows() const { return d_row_ptr.size() - 1; }
XGBOOST_DEVICE size_t NumCols() const { return num_features; }
[[nodiscard]] XGBOOST_DEVICE size_t NumRows() const { return d_row_ptr.size() - 1; }
[[nodiscard]] XGBOOST_DEVICE size_t NumCols() const { return num_features; }
};
struct SparsePageLoader {
@@ -137,7 +135,7 @@ struct SparsePageLoader {
__syncthreads();
}
}
__device__ float GetElement(size_t ridx, size_t fidx) const {
[[nodiscard]] __device__ float GetElement(size_t ridx, size_t fidx) const {
if (use_shared) {
return smem[threadIdx.x * data.num_features + fidx];
} else {
@@ -151,7 +149,7 @@ struct EllpackLoader {
XGBOOST_DEVICE EllpackLoader(EllpackDeviceAccessor const& m, bool, bst_feature_t, bst_row_t,
size_t, float)
: matrix{m} {}
__device__ __forceinline__ float GetElement(size_t ridx, size_t fidx) const {
[[nodiscard]] __device__ __forceinline__ float GetElement(size_t ridx, size_t fidx) const {
auto gidx = matrix.GetBinIndex(ridx, fidx);
if (gidx == -1) {
return nan("");

View File

@@ -429,11 +429,11 @@ void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
}
}
void GPUHistEvaluator::EvaluateSplits(
const std::vector<bst_node_t> &nidx, bst_feature_t max_active_features,
common::Span<const EvaluateSplitInputs> d_inputs,
EvaluateSplitSharedInputs shared_inputs,
common::Span<GPUExpandEntry> out_entries) {
void GPUHistEvaluator::EvaluateSplits(Context const *ctx, const std::vector<bst_node_t> &nidx,
bst_feature_t max_active_features,
common::Span<const EvaluateSplitInputs> d_inputs,
EvaluateSplitSharedInputs shared_inputs,
common::Span<GPUExpandEntry> out_entries) {
auto evaluator = this->tree_evaluator_.template GetEvaluator<GPUTrainingParam>();
dh::TemporaryArray<DeviceSplitCandidate> splits_out_storage(d_inputs.size());
@@ -451,19 +451,20 @@ void GPUHistEvaluator::EvaluateSplits(
out_splits.size() * sizeof(DeviceSplitCandidate));
// Reduce to get the best candidate from all workers.
dh::LaunchN(out_splits.size(), [world_size, all_candidates, out_splits] __device__(size_t i) {
out_splits[i] = all_candidates[i];
for (auto rank = 1; rank < world_size; rank++) {
out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i];
}
});
dh::LaunchN(out_splits.size(), ctx->CUDACtx()->Stream(),
[world_size, all_candidates, out_splits] __device__(size_t i) {
out_splits[i] = all_candidates[i];
for (auto rank = 1; rank < world_size; rank++) {
out_splits[i] = out_splits[i] + all_candidates[rank * out_splits.size() + i];
}
});
}
auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
auto d_entries = out_entries;
auto device_cats_accessor = this->DeviceCatStorage(nidx);
// turn candidate into entry, along with handling sort based split.
dh::LaunchN(d_inputs.size(), [=] __device__(size_t i) mutable {
dh::LaunchN(d_inputs.size(), ctx->CUDACtx()->Stream(), [=] __device__(size_t i) mutable {
auto const input = d_inputs[i];
auto &split = out_splits[i];
// Subtract parent gain here
@@ -498,12 +499,12 @@ void GPUHistEvaluator::EvaluateSplits(
this->CopyToHost(nidx);
}
GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
EvaluateSplitInputs input, EvaluateSplitSharedInputs shared_inputs) {
GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input,
EvaluateSplitSharedInputs shared_inputs) {
dh::device_vector<EvaluateSplitInputs> inputs = std::vector<EvaluateSplitInputs>{input};
dh::TemporaryArray<GPUExpandEntry> out_entries(1);
this->EvaluateSplits({input.nidx}, input.feature_set.size(), dh::ToSpan(inputs), shared_inputs,
dh::ToSpan(out_entries));
this->EvaluateSplits(ctx, {input.nidx}, input.feature_set.size(), dh::ToSpan(inputs),
shared_inputs, dh::ToSpan(out_entries));
GPUExpandEntry root_entry;
dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry),

View File

@@ -193,7 +193,7 @@ class GPUHistEvaluator {
/**
* \brief Evaluate splits for left and right nodes.
*/
void EvaluateSplits(const std::vector<bst_node_t> &nidx,
void EvaluateSplits(Context const* ctx, const std::vector<bst_node_t> &nidx,
bst_feature_t max_active_features,
common::Span<const EvaluateSplitInputs> d_inputs,
EvaluateSplitSharedInputs shared_inputs,
@@ -201,7 +201,7 @@ class GPUHistEvaluator {
/**
* \brief Evaluate splits for root node.
*/
GPUExpandEntry EvaluateSingleSplit(EvaluateSplitInputs input,
GPUExpandEntry EvaluateSingleSplit(Context const *ctx, EvaluateSplitInputs input,
EvaluateSplitSharedInputs shared_inputs);
};
} // namespace tree

View File

@@ -74,7 +74,7 @@ common::Span<bst_feature_t const> GPUHistEvaluator::SortHistogram(
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator) {
dh::XGBCachingDeviceAllocator<char> alloc;
auto sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
dh::Iota(sorted_idx);
dh::Iota(sorted_idx, dh::DefaultStream());
auto data = this->SortInput(d_inputs.size(), shared_inputs.feature_values.size());
auto it = thrust::make_counting_iterator(0u);
auto d_feature_idx = dh::ToSpan(feature_idx_);

View File

@@ -16,8 +16,7 @@
#include "row_partitioner.cuh"
#include "xgboost/base.h"
namespace xgboost {
namespace tree {
namespace xgboost::tree {
namespace {
struct Pair {
GradientPair first;
@@ -53,7 +52,8 @@ struct Clip : public thrust::unary_function<GradientPair, Pair> {
*
* to avoid outliers, as the full reduction is reproducible on GPU with reduction tree.
*/
GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair, MetaInfo const& info) {
GradientQuantiser::GradientQuantiser(Context const*, common::Span<GradientPair const> gpair,
MetaInfo const& info) {
using GradientSumT = GradientPairPrecise;
using T = typename GradientSumT::ValueT;
dh::XGBCachingDeviceAllocator<char> alloc;
@@ -100,7 +100,6 @@ GradientQuantiser::GradientQuantiser(common::Span<GradientPair const> gpair, Met
static_cast<T>(1) / to_floating_point_.GetHess());
}
XGBOOST_DEV_INLINE void
AtomicAddGpairShared(xgboost::GradientPairInt64 *dest,
xgboost::GradientPairInt64 const &gpair) {
@@ -333,6 +332,4 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const&
dh::safe_cuda(cudaGetLastError());
}
} // namespace tree
} // namespace xgboost
} // namespace xgboost::tree

View File

@@ -39,18 +39,20 @@ private:
GradientPairPrecise to_floating_point_;
public:
GradientQuantiser(common::Span<GradientPair const> gpair, MetaInfo const& info);
XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const {
GradientQuantiser(Context const* ctx, common::Span<GradientPair const> gpair, MetaInfo const& info);
[[nodiscard]] XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPair const& gpair) const {
auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
gpair.GetHess() * to_fixed_point_.GetHess());
gpair.GetHess() * to_fixed_point_.GetHess());
return adjusted;
}
XGBOOST_DEVICE GradientPairInt64 ToFixedPoint(GradientPairPrecise const& gpair) const {
[[nodiscard]] XGBOOST_DEVICE GradientPairInt64
ToFixedPoint(GradientPairPrecise const& gpair) const {
auto adjusted = GradientPairInt64(gpair.GetGrad() * to_fixed_point_.GetGrad(),
gpair.GetHess() * to_fixed_point_.GetHess());
gpair.GetHess() * to_fixed_point_.GetHess());
return adjusted;
}
XGBOOST_DEVICE GradientPairPrecise ToFloatingPoint(const GradientPairInt64&gpair) const {
[[nodiscard]] XGBOOST_DEVICE GradientPairPrecise
ToFloatingPoint(const GradientPairInt64& gpair) const {
auto g = gpair.GetQuantisedGrad() * to_floating_point_.GetGrad();
auto h = gpair.GetQuantisedHess() * to_floating_point_.GetHess();
return {g,h};

View File

@@ -171,7 +171,8 @@ class HistogramBuilder {
}
}
void SyncHistogram(RegTree const *p_tree, std::vector<bst_node_t> const &nodes_to_build,
void SyncHistogram(Context const *, RegTree const *p_tree,
std::vector<bst_node_t> const &nodes_to_build,
std::vector<bst_node_t> const &nodes_to_trick) {
auto n_total_bins = buffer_.TotalBins();
common::BlockedSpace2d space(
@@ -277,14 +278,14 @@ class MultiHistogramBuilder {
}
for (bst_target_t t = 0; t < p_tree->NumTargets(); ++t) {
this->target_builders_[t].SyncHistogram(p_tree, nodes, dummy_sub);
this->target_builders_[t].SyncHistogram(ctx_, p_tree, nodes, dummy_sub);
}
}
/**
* @brief Build histogram for left and right child of valid candidates
*/
template <typename Partitioner, typename ExpandEntry>
void BuildHistLeftRight(DMatrix *p_fmat, RegTree const *p_tree,
void BuildHistLeftRight(Context const *ctx, DMatrix *p_fmat, RegTree const *p_tree,
std::vector<Partitioner> const &partitioners,
std::vector<ExpandEntry> const &valid_candidates,
linalg::MatrixView<GradientPair const> gpair, BatchParam const &param,
@@ -318,7 +319,7 @@ class MultiHistogramBuilder {
}
for (bst_target_t t = 0; t < p_tree->NumTargets(); ++t) {
this->target_builders_[t].SyncHistogram(p_tree, nodes_to_build, nodes_to_sub);
this->target_builders_[t].SyncHistogram(ctx, p_tree, nodes_to_build, nodes_to_sub);
}
}

View File

@@ -12,7 +12,7 @@
namespace xgboost::tree {
DMLC_REGISTER_PARAMETER(HistMakerTrainParam);
void HistMakerTrainParam::CheckTreesSynchronized(RegTree const* local_tree) const {
void HistMakerTrainParam::CheckTreesSynchronized(Context const*, RegTree const* local_tree) const {
if (!this->debug_synchronize) {
return;
}

View File

@@ -15,7 +15,7 @@ struct HistMakerTrainParam : public XGBoostParameter<HistMakerTrainParam> {
bool debug_synchronize{false};
std::size_t max_cached_hist_node{DefaultNodes()};
void CheckTreesSynchronized(RegTree const* local_tree) const;
void CheckTreesSynchronized(Context const* ctx, RegTree const* local_tree) const;
// declare parameters
DMLC_DECLARE_PARAMETER(HistMakerTrainParam) {

View File

@@ -140,7 +140,7 @@ class GloablApproxBuilder {
std::vector<GradientPair> const &gpair, common::Span<float> hess) {
monitor_->Start(__func__);
this->histogram_builder_.BuildHistLeftRight(
p_fmat, p_tree, partitioner_, valid_candidates,
ctx_, p_fmat, p_tree, partitioner_, valid_candidates,
linalg::MakeTensorView(ctx_, gpair, gpair.size(), 1), BatchSpec(*param_, hess));
monitor_->Stop(__func__);
}
@@ -248,8 +248,7 @@ class GlobalApproxUpdater : public TreeUpdater {
std::unique_ptr<GloablApproxBuilder> pimpl_;
// pointer to the last DMatrix, used for update prediction cache.
DMatrix *cached_{nullptr};
std::shared_ptr<common::ColumnSampler> column_sampler_ =
std::make_shared<common::ColumnSampler>();
std::shared_ptr<common::ColumnSampler> column_sampler_;
ObjInfo const *task_;
HistMakerTrainParam hist_param_;
@@ -284,6 +283,9 @@ class GlobalApproxUpdater : public TreeUpdater {
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree *> &trees) override {
CHECK(hist_param_.GetInitialised());
if (!column_sampler_) {
column_sampler_ = common::MakeColumnSampler(ctx_);
}
pimpl_ = std::make_unique<GloablApproxBuilder>(param, &hist_param_, m->Info(), ctx_,
column_sampler_, task_, &monitor_);
@@ -300,7 +302,7 @@ class GlobalApproxUpdater : public TreeUpdater {
std::size_t t_idx = 0;
for (auto p_tree : trees) {
this->pimpl_->UpdateTree(m, s_gpair, hess, p_tree, &out_position[t_idx]);
hist_param_.CheckTreesSynchronized(p_tree);
hist_param_.CheckTreesSynchronized(ctx_, p_tree);
++t_idx;
}
}

View File

@@ -225,9 +225,12 @@ class ColMaker: public TreeUpdater {
}
}
{
column_sampler_.Init(ctx_, fmat.Info().num_col_,
fmat.Info().feature_weights.ConstHostVector(), param_.colsample_bynode,
param_.colsample_bylevel, param_.colsample_bytree);
if (!column_sampler_) {
column_sampler_ = common::MakeColumnSampler(ctx_);
}
column_sampler_->Init(
ctx_, fmat.Info().num_col_, fmat.Info().feature_weights.ConstHostVector(),
param_.colsample_bynode, param_.colsample_bylevel, param_.colsample_bytree);
}
{
// setup temp space for each thread
@@ -467,7 +470,7 @@ class ColMaker: public TreeUpdater {
RegTree *p_tree) {
auto evaluator = tree_evaluator_.GetEvaluator();
auto feat_set = column_sampler_.GetFeatureSet(depth);
auto feat_set = column_sampler_->GetFeatureSet(depth);
for (const auto &batch : p_fmat->GetBatches<SortedCSCPage>(ctx_)) {
this->UpdateSolution(batch, feat_set->HostVector(), gpair, p_fmat);
}
@@ -586,7 +589,7 @@ class ColMaker: public TreeUpdater {
const ColMakerTrainParam& colmaker_train_param_;
// number of omp thread used during training
Context const* ctx_;
common::ColumnSampler column_sampler_;
std::shared_ptr<common::ColumnSampler> column_sampler_;
// Instance Data: current node position in the tree of each instance
std::vector<int> position_;
// PerThread x PerTreeNode: statistics for per thread construction

View File

@@ -247,7 +247,7 @@ struct GPUHistMakerDevice {
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
dmat->Info().IsColumnSplit(), ctx_->Device());
quantiser = std::make_unique<GradientQuantiser>(this->gpair, dmat->Info());
quantiser = std::make_unique<GradientQuantiser>(ctx_, this->gpair, dmat->Info());
row_partitioner.reset(); // Release the device memory first before reallocating
row_partitioner = std::make_unique<RowPartitioner>(ctx_->Device(), sample.sample_rows);
@@ -277,7 +277,7 @@ struct GPUHistMakerDevice {
matrix.min_fvalue,
matrix.is_dense && !collective::IsDistributed()
};
auto split = this->evaluator_.EvaluateSingleSplit(inputs, shared_inputs);
auto split = this->evaluator_.EvaluateSingleSplit(ctx_, inputs, shared_inputs);
return split;
}
@@ -330,7 +330,7 @@ struct GPUHistMakerDevice {
d_node_inputs.data().get(), h_node_inputs.data(),
h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault));
this->evaluator_.EvaluateSplits(nidx, max_active_features, dh::ToSpan(d_node_inputs),
this->evaluator_.EvaluateSplits(ctx_, nidx, max_active_features, dh::ToSpan(d_node_inputs),
shared_inputs, dh::ToSpan(entries));
dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
@@ -848,7 +848,7 @@ class GPUHistMaker : public TreeUpdater {
std::size_t t_idx{0};
for (xgboost::RegTree* tree : trees) {
this->UpdateTree(param, gpair_hdv, dmat, tree, &out_position[t_idx]);
this->hist_maker_param_.CheckTreesSynchronized(tree);
this->hist_maker_param_.CheckTreesSynchronized(ctx_, tree);
++t_idx;
}
@@ -992,7 +992,7 @@ class GPUGlobalApproxMaker : public TreeUpdater {
std::size_t t_idx{0};
for (xgboost::RegTree* tree : trees) {
this->UpdateTree(gpair->Data(), p_fmat, tree, &out_position[t_idx]);
this->hist_maker_param_.CheckTreesSynchronized(tree);
this->hist_maker_param_.CheckTreesSynchronized(ctx_, tree);
++t_idx;
}

View File

@@ -1,5 +1,5 @@
/**
* Copyright 2017-2023 by XGBoost Contributors
* Copyright 2017-2023, XGBoost Contributors
* \file updater_quantile_hist.cc
* \brief use quantized feature values to construct a tree
* \author Philip Cho, Tianqi Checn, Egor Smirnov
@@ -228,8 +228,8 @@ class MultiTargetHistBuilder {
std::vector<MultiExpandEntry> const &valid_candidates,
linalg::MatrixView<GradientPair const> gpair) {
monitor_->Start(__func__);
histogram_builder_->BuildHistLeftRight(p_fmat, p_tree, partitioner_, valid_candidates, gpair,
HistBatch(param_));
histogram_builder_->BuildHistLeftRight(ctx_, p_fmat, p_tree, partitioner_, valid_candidates,
gpair, HistBatch(param_));
monitor_->Stop(__func__);
}
@@ -436,8 +436,8 @@ class HistUpdater {
std::vector<CPUExpandEntry> const &valid_candidates,
linalg::MatrixView<GradientPair const> gpair) {
monitor_->Start(__func__);
this->histogram_builder_->BuildHistLeftRight(p_fmat, p_tree, partitioner_, valid_candidates,
gpair, HistBatch(param_));
this->histogram_builder_->BuildHistLeftRight(ctx_, p_fmat, p_tree, partitioner_,
valid_candidates, gpair, HistBatch(param_));
monitor_->Stop(__func__);
}
@@ -470,8 +470,7 @@ class HistUpdater {
class QuantileHistMaker : public TreeUpdater {
std::unique_ptr<HistUpdater> p_impl_{nullptr};
std::unique_ptr<MultiTargetHistBuilder> p_mtimpl_{nullptr};
std::shared_ptr<common::ColumnSampler> column_sampler_ =
std::make_shared<common::ColumnSampler>();
std::shared_ptr<common::ColumnSampler> column_sampler_;
common::Monitor monitor_;
ObjInfo const *task_{nullptr};
HistMakerTrainParam hist_param_;
@@ -495,6 +494,10 @@ class QuantileHistMaker : public TreeUpdater {
void Update(TrainParam const *param, linalg::Matrix<GradientPair> *gpair, DMatrix *p_fmat,
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree *> &trees) override {
if (!column_sampler_) {
column_sampler_ = common::MakeColumnSampler(ctx_);
}
if (trees.front()->IsMultiTarget()) {
CHECK(hist_param_.GetInitialised());
CHECK(param->monotone_constraints.empty()) << "monotone constraint" << MTNotImplemented();
@@ -537,7 +540,7 @@ class QuantileHistMaker : public TreeUpdater {
h_out_position, *tree_it);
}
hist_param_.CheckTreesSynchronized(*tree_it);
hist_param_.CheckTreesSynchronized(ctx_, *tree_it);
}
}