merge latest changes
commit 2d7ffbdf3d

49 .github/workflows/main.yml (vendored)
@@ -29,7 +29,7 @@ jobs:
      run: |
        mkdir build
        cd build
        cmake .. -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_DENSE_PARSER=ON -GNinja -DBUILD_DEPRECATED_CLI=ON
        cmake .. -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -GNinja -DBUILD_DEPRECATED_CLI=ON
        ninja -v
    - name: Run gtest binary
      run: |
@@ -63,6 +63,45 @@ jobs:
        cd build
        ctest --extra-verbose

  gtest-cpu-sycl:
    name: Test Google C++ unittest (CPU SYCL)
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest]
        python-version: ["3.8"]
    steps:
      - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
        with:
          submodules: 'true'
      - uses: mamba-org/provision-with-micromamba@f347426e5745fe3dfc13ec5baf20496990d0281f # v14
        with:
          cache-downloads: true
          cache-env: true
          environment-name: linux_sycl_test
          environment-file: tests/ci_build/conda_env/linux_sycl_test.yml

      - name: Display Conda env
        run: |
          conda info
          conda list
      - name: Build and install XGBoost
        shell: bash -l {0}
        run: |
          mkdir build
          cd build
          cmake .. -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX
          make -j$(nproc)
      - name: Run gtest binary for SYCL
        run: |
          cd build
          ./testxgboost --gtest_filter=Sycl*
      - name: Run gtest binary for non SYCL
        run: |
          cd build
          ./testxgboost --gtest_filter=-Sycl*

  c-api-demo:
    name: Test installing XGBoost lib + building the C API demo
    runs-on: ${{ matrix.os }}
@@ -144,11 +183,5 @@ jobs:
          python -m pip install wheel setuptools cmakelint cpplint pylint
      - name: Run lint
        run: |
          python3 tests/ci_build/lint_cpp.py xgboost cpp R-package/src

          python3 tests/ci_build/lint_cpp.py xgboost cpp include src python-package \
            --exclude_path python-package/xgboost/dmlc-core python-package/xgboost/include \
              python-package/xgboost/lib python-package/xgboost/rabit \
              python-package/xgboost/src

          python3 tests/ci_build/lint_cpp.py
          sh ./tests/ci_build/lint_cmake.sh
41 .github/workflows/python_tests.yml (vendored)
@@ -256,6 +256,47 @@ jobs:
      run: |
        pytest -s -v -rxXs --durations=0 ./tests/test_distributed/test_with_spark

  python-sycl-tests-on-ubuntu:
    name: Test XGBoost Python package with SYCL on ${{ matrix.config.os }}
    runs-on: ${{ matrix.config.os }}
    timeout-minutes: 90
    strategy:
      matrix:
        config:
          - {os: ubuntu-latest, python-version: "3.8"}

    steps:
      - uses: actions/checkout@v2
        with:
          submodules: 'true'

      - uses: mamba-org/provision-with-micromamba@f347426e5745fe3dfc13ec5baf20496990d0281f # v14
        with:
          cache-downloads: true
          cache-env: true
          environment-name: linux_sycl_test
          environment-file: tests/ci_build/conda_env/linux_sycl_test.yml

      - name: Display Conda env
        run: |
          conda info
          conda list
      - name: Build XGBoost on Ubuntu
        run: |
          mkdir build
          cd build
          cmake .. -DPLUGIN_SYCL=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX
          make -j$(nproc)
      - name: Install Python package
        run: |
          cd python-package
          python --version
          pip install -v .
      - name: Test Python package
        run: |
          pytest -s -v -rxXs --durations=0 ./tests/python-sycl/


  python-system-installation-on-ubuntu:
    name: Test XGBoost Python package System Installation on ${{ matrix.os }}
    runs-on: ${{ matrix.os }}
@@ -1,4 +1,11 @@
cmake_minimum_required(VERSION 3.18 FATAL_ERROR)

if(PLUGIN_SYCL)
  set(CMAKE_CXX_COMPILER "g++")
  set(CMAKE_C_COMPILER "gcc")
  string(REPLACE " -isystem ${CONDA_PREFIX}/include" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
endif()

project(xgboost LANGUAGES CXX C VERSION 2.1.0)
include(cmake/Utils.cmake)
list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
@@ -69,7 +76,10 @@ option(KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR "Output build artifacts in CMake binar
option(USE_CUDA "Build with GPU acceleration" OFF)
option(USE_PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" ON)
option(USE_NCCL "Build with NCCL to enable distributed GPU support." OFF)
# This is specifically designed for PyPI binary release and should be disabled for most of the cases.
option(USE_DLOPEN_NCCL "Whether to load nccl dynamically." OFF)
option(BUILD_WITH_SHARED_NCCL "Build with shared NCCL library." OFF)

if(USE_CUDA)
  if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT DEFINED ENV{CUDAARCHS})
    set(GPU_COMPUTE_VER "" CACHE STRING
@@ -80,6 +90,7 @@ if(USE_CUDA)
    unset(GPU_COMPUTE_VER CACHE)
  endif()
endif()

# CUDA device LTO was introduced in CMake v3.25 and requires host LTO to also be enabled but can still
# be explicitly disabled allowing for LTO on host only, host and device, or neither, but device-only LTO
# is not a supproted configuration
@@ -91,6 +102,8 @@ cmake_dependent_option(USE_CUDA_LTO
## HIP
option(USE_HIP "Build with GPU acceleration" OFF)
option(USE_RCCL "Build with RCCL to enable distributed GPU support." OFF)
# This is specifically designed for PyPI binary release and should be disabled for most of the cases.
option(USE_DLOPEN_RCCL "Whether to load nccl dynamically." OFF)
option(BUILD_WITH_SHARED_RCCL "Build with shared RCCL library." OFF)
## Sanitizers
option(USE_SANITIZER "Use santizer flags" OFF)
@@ -99,11 +112,10 @@ set(ENABLED_SANITIZERS "address" "leak" CACHE STRING
  "Semicolon separated list of sanitizer names. E.g 'address;leak'. Supported sanitizers are
address, leak, undefined and thread.")
## Plugins
option(PLUGIN_DENSE_PARSER "Build dense parser plugin" OFF)
option(PLUGIN_RMM "Build with RAPIDS Memory Manager (RMM)" OFF)
option(PLUGIN_FEDERATED "Build with Federated Learning" OFF)
## TODO: 1. Add check if DPC++ compiler is used for building
option(PLUGIN_UPDATER_ONEAPI "DPC++ updater" OFF)
option(PLUGIN_SYCL "SYCL plugin" OFF)
option(ADD_PKGCONFIG "Add xgboost.pc into system." ON)

#-- Checks for building XGBoost
@@ -119,12 +131,24 @@ endif()
if(BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL))
  message(SEND_ERROR "Build XGBoost with -DUSE_NCCL=ON to enable BUILD_WITH_SHARED_NCCL.")
endif()
if(USE_DLOPEN_NCCL AND (NOT USE_NCCL))
  message(SEND_ERROR "Build XGBoost with -DUSE_NCCL=ON to enable USE_DLOPEN_NCCL.")
endif()
if(USE_DLOPEN_NCCL AND (NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux")))
  message(SEND_ERROR "`USE_DLOPEN_NCCL` supports only Linux at the moment.")
endif()
if(USE_RCCL AND NOT (USE_HIP))
  message(SEND_ERROR "`USE_RCCL` must be enabled with `USE_HIP` flag.")
endif()
if(BUILD_WITH_SHARED_RCCL AND (NOT USE_RCCL))
  message(SEND_ERROR "Build XGBoost with -DUSE_RCCL=ON to enable BUILD_WITH_SHARED_RCCL.")
endif()
if(USE_DLOPEN_RCCL AND (NOT USE_RCCL))
  message(SEND_ERROR "Build XGBoost with -DUSE_RCCL=ON to enable USE_DLOPEN_RCCL.")
endif()
if(USE_DLOPEN_RCCL AND (NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux")))
  message(SEND_ERROR "`USE_DLOPEN_RCCL` supports only Linux at the moment.")
endif()
if(JVM_BINDINGS AND R_LIB)
  message(SEND_ERROR "`R_LIB' is not compatible with `JVM_BINDINGS' as they both have customized configurations.")
endif()
@@ -185,6 +209,9 @@ endif()
if(USE_HDFS)
  message(SEND_ERROR "The option `USE_HDFS` has been removed from XGBoost")
endif()
if(PLUGIN_DENSE_PARSER)
  message(SEND_ERROR "The option `PLUGIN_DENSE_PARSER` has been removed from XGBoost.")
endif()

#-- Sanitizer
if(USE_SANITIZER)
@@ -332,6 +359,15 @@ if(PLUGIN_RMM)
  get_target_property(rmm_link_libs rmm::rmm INTERFACE_LINK_LIBRARIES)
endif()

if(PLUGIN_SYCL)
  set(CMAKE_CXX_LINK_EXECUTABLE
    "icpx <FLAGS> <CMAKE_CXX_LINK_FLAGS> -qopenmp <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
  set(CMAKE_CXX_CREATE_SHARED_LIBRARY
    "icpx <CMAKE_SHARED_LIBRARY_CXX_FLAGS> -qopenmp <LANGUAGE_COMPILE_FLAGS> \
     <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <SONAME_FLAG>,<TARGET_SONAME> \
     -o <TARGET> <OBJECTS> <LINK_LIBRARIES>")
endif()

#-- library
if(BUILD_STATIC_LIB)
  add_library(xgboost STATIC)
@@ -10,8 +10,8 @@ The Project Management Committee(PMC) consists group of active committers that m
  - Tianqi is a Ph.D. student working on large-scale machine learning. He is the creator of the project.
* [Michael Benesty](https://github.com/pommedeterresautee)
  - Michael is a lawyer and data scientist in France. He is the creator of XGBoost interactive analysis module in R.
* [Yuan Tang](https://github.com/terrytangyuan), Akuity
  - Yuan is a founding engineer at Akuity. He contributed mostly in R and Python packages.
* [Yuan Tang](https://github.com/terrytangyuan), Red Hat
  - Yuan is a principal software engineer at Red Hat. He contributed mostly in R and Python packages.
* [Nan Zhu](https://github.com/CodingCat), Uber
  - Nan is a software engineer in Uber. He contributed mostly in JVM packages.
* [Jiaming Yuan](https://github.com/trivialfis)
@@ -5,7 +5,7 @@
#' \code{\link{xgb.DMatrix.save}}).
#'
#' @param data a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object,
#'   a \code{dgRMatrix} object (only when making predictions from a fitted model),
#'   a \code{dgRMatrix} object,
#'   a \code{dsparseVector} object (only when making predictions from a fitted model, will be
#'   interpreted as a row vector), or a character string representing a filename.
#' @param info a named list of additional information to store in the \code{xgb.DMatrix} object.

@@ -15,7 +15,7 @@ xgb.DMatrix(
}
\arguments{
\item{data}{a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object,
a \code{dgRMatrix} object (only when making predictions from a fitted model),
a \code{dgRMatrix} object,
a \code{dsparseVector} object (only when making predictions from a fitted model, will be
interpreted as a row vector), or a character string representing a filename.}
@@ -8,6 +8,7 @@
#include <xgboost/data.h>
#include <xgboost/logging.h>

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <sstream>
@@ -21,6 +22,118 @@

#include "./xgboost_R.h"  // Must follow other includes.

namespace {

struct ErrorWithUnwind : public std::exception {};

void ThrowExceptionFromRError(void *unused, Rboolean jump) {
  if (jump) {
    throw ErrorWithUnwind();
  }
}

struct PtrToConstChar {
  const char *ptr;
};

SEXP WrappedMkChar(void *void_ptr) {
  return Rf_mkChar(static_cast<PtrToConstChar*>(void_ptr)->ptr);
}

SEXP SafeMkChar(const char *c_str, SEXP continuation_token) {
  PtrToConstChar ptr_struct{c_str};
  return R_UnwindProtect(
      WrappedMkChar, static_cast<void*>(&ptr_struct),
      ThrowExceptionFromRError, nullptr,
      continuation_token);
}

[[nodiscard]] std::string MakeArrayInterfaceFromRMat(SEXP R_mat) {
  SEXP mat_dims = Rf_getAttrib(R_mat, R_DimSymbol);
  const int *ptr_mat_dims = INTEGER(mat_dims);

  // Lambda for type dispatch.
  auto make_matrix = [=](auto const *ptr) {
    using namespace xgboost;  // NOLINT
    using T = std::remove_pointer_t<decltype(ptr)>;

    auto m = linalg::MatrixView<T>{
        common::Span{ptr,
            static_cast<std::size_t>(ptr_mat_dims[0]) * static_cast<std::size_t>(ptr_mat_dims[1])},
        {ptr_mat_dims[0], ptr_mat_dims[1]},  // Shape
        DeviceOrd::CPU(),
        linalg::Order::kF  // R uses column-major
    };
    CHECK(m.FContiguous());
    return linalg::ArrayInterfaceStr(m);
  };

  const SEXPTYPE arr_type = TYPEOF(R_mat);
  switch (arr_type) {
    case REALSXP:
      return make_matrix(REAL(R_mat));
    case INTSXP:
      return make_matrix(INTEGER(R_mat));
    case LGLSXP:
      return make_matrix(LOGICAL(R_mat));
    default:
      LOG(FATAL) << "Array or matrix has unsupported type.";
  }

  LOG(FATAL) << "Not reachable";
  return "";
}

[[nodiscard]] std::string MakeArrayInterfaceFromRVector(SEXP R_vec) {
  const size_t vec_len = Rf_xlength(R_vec);

  // Lambda for type dispatch.
  auto make_vec = [=](auto const *ptr) {
    using namespace xgboost;  // NOLINT
    auto v = linalg::MakeVec(ptr, vec_len);
    return linalg::ArrayInterfaceStr(v);
  };

  const SEXPTYPE arr_type = TYPEOF(R_vec);
  switch (arr_type) {
    case REALSXP:
      return make_vec(REAL(R_vec));
    case INTSXP:
      return make_vec(INTEGER(R_vec));
    case LGLSXP:
      return make_vec(LOGICAL(R_vec));
    default:
      LOG(FATAL) << "Array or matrix has unsupported type.";
  }

  LOG(FATAL) << "Not reachable";
  return "";
}

[[nodiscard]] std::string MakeJsonConfigForArray(SEXP missing, SEXP n_threads, SEXPTYPE arr_type) {
  using namespace ::xgboost;  // NOLINT
  Json jconfig{Object{}};

  const SEXPTYPE missing_type = TYPEOF(missing);
  if (Rf_isNull(missing) || (missing_type == REALSXP && ISNAN(Rf_asReal(missing))) ||
      (missing_type == LGLSXP && Rf_asLogical(missing) == R_NaInt) ||
      (missing_type == INTSXP && Rf_asInteger(missing) == R_NaInt)) {
    // missing is not specified
    if (arr_type == REALSXP) {
      jconfig["missing"] = std::numeric_limits<double>::quiet_NaN();
    } else {
      jconfig["missing"] = R_NaInt;
    }
  } else {
    // missing specified
    jconfig["missing"] = Rf_asReal(missing);
  }

  jconfig["nthread"] = Rf_asInteger(n_threads);
  return Json::Dump(jconfig);
}
}  // namespace

/*!
 * \brief macro to annotate begin of api
 */
@@ -47,13 +160,6 @@

using dmlc::BeginPtr;

xgboost::Context const *DMatrixCtx(DMatrixHandle handle) {
  CHECK_HANDLE();
  auto p_m = static_cast<std::shared_ptr<xgboost::DMatrix> *>(handle);
  CHECK(p_m);
  return p_m->get()->Ctx();
}

XGB_DLL SEXP XGCheckNullPtr_R(SEXP handle) {
  return ScalarLogical(R_ExternalPtrAddr(handle) == NULL);
}
@@ -82,11 +188,11 @@ XGB_DLL SEXP XGBGetGlobalConfig_R() {
}

XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
  SEXP ret;
  SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
  R_API_BEGIN();
  DMatrixHandle handle;
  CHECK_CALL(XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent), &handle));
  ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
  R_SetExternalPtrAddr(ret, handle);
  R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
  R_API_END();
  UNPROTECT(1);
@@ -94,47 +200,19 @@ XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
}

XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing, SEXP n_threads) {
  SEXP ret;
  SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
  R_API_BEGIN();
  SEXP dim = getAttrib(mat, R_DimSymbol);
  size_t nrow = static_cast<size_t>(INTEGER(dim)[0]);
  size_t ncol = static_cast<size_t>(INTEGER(dim)[1]);
  const bool is_int = TYPEOF(mat) == INTSXP;
  double *din;
  int *iin;
  if (is_int) {
    iin = INTEGER(mat);
  } else {
    din = REAL(mat);
  }
  std::vector<float> data(nrow * ncol);
  xgboost::Context ctx;
  ctx.nthread = asInteger(n_threads);
  std::int32_t threads = ctx.Threads();

  if (is_int) {
    xgboost::common::ParallelFor(nrow, threads, [&](xgboost::omp_ulong i) {
      for (size_t j = 0; j < ncol; ++j) {
        auto v = iin[i + nrow * j];
        if (v == NA_INTEGER) {
          data[i * ncol + j] = std::numeric_limits<float>::quiet_NaN();
        } else {
          data[i * ncol + j] = static_cast<float>(v);
        }
      }
    });
  } else {
    xgboost::common::ParallelFor(nrow, threads, [&](xgboost::omp_ulong i) {
      for (size_t j = 0; j < ncol; ++j) {
        data[i * ncol + j] = din[i + nrow * j];
      }
    });
  }

  DMatrixHandle handle;
  CHECK_CALL(XGDMatrixCreateFromMat_omp(BeginPtr(data), nrow, ncol,
                                        asReal(missing), &handle, threads));
  ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
  int res_code;
  {
    auto array_str = MakeArrayInterfaceFromRMat(mat);
    auto config_str = MakeJsonConfigForArray(missing, n_threads, TYPEOF(mat));

    res_code = XGDMatrixCreateFromDense(array_str.c_str(), config_str.c_str(), &handle);
  }
  CHECK_CALL(res_code);
  R_SetExternalPtrAddr(ret, handle);
  R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
  R_API_END();
  UNPROTECT(1);
@@ -148,8 +226,8 @@ void CreateFromSparse(SEXP indptr, SEXP indices, SEXP data, std::string *indptr_
  const int *p_indices = INTEGER(indices);
  const double *p_data = REAL(data);

  auto nindptr = static_cast<std::size_t>(length(indptr));
  auto ndata = static_cast<std::size_t>(length(data));
  auto nindptr = static_cast<std::size_t>(Rf_xlength(indptr));
  auto ndata = static_cast<std::size_t>(Rf_xlength(data));
  CHECK_EQ(ndata, p_indptr[nindptr - 1]);
  xgboost::detail::MakeSparseFromPtr(p_indptr, p_indices, p_data, nindptr, indptr_str, indices_str,
                                     data_str);
@@ -158,30 +236,32 @@ void CreateFromSparse(SEXP indptr, SEXP indices, SEXP data, std::string *indptr_

XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data, SEXP num_row,
                                      SEXP missing, SEXP n_threads) {
  SEXP ret;
  SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
  R_API_BEGIN();
  std::int32_t threads = asInteger(n_threads);

  using xgboost::Integer;
  using xgboost::Json;
  using xgboost::Object;

  std::string sindptr, sindices, sdata;
  CreateFromSparse(indptr, indices, data, &sindptr, &sindices, &sdata);
  auto nrow = static_cast<std::size_t>(INTEGER(num_row)[0]);

  DMatrixHandle handle;
  Json jconfig{Object{}};
  // Construct configuration
  jconfig["nthread"] = Integer{threads};
  jconfig["missing"] = xgboost::Number{asReal(missing)};
  std::string config;
  Json::Dump(jconfig, &config);
  CHECK_CALL(XGDMatrixCreateFromCSC(sindptr.c_str(), sindices.c_str(), sdata.c_str(), nrow,
                                    config.c_str(), &handle));

  ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
  int res_code;
  {
    using xgboost::Integer;
    using xgboost::Json;
    using xgboost::Object;
    std::string sindptr, sindices, sdata;
    CreateFromSparse(indptr, indices, data, &sindptr, &sindices, &sdata);
    auto nrow = static_cast<std::size_t>(INTEGER(num_row)[0]);

    Json jconfig{Object{}};
    // Construct configuration
    jconfig["nthread"] = Integer{threads};
    jconfig["missing"] = xgboost::Number{asReal(missing)};
    std::string config;
    Json::Dump(jconfig, &config);
    res_code = XGDMatrixCreateFromCSC(sindptr.c_str(), sindices.c_str(), sdata.c_str(), nrow,
                                      config.c_str(), &handle);
  }
  CHECK_CALL(res_code);

  R_SetExternalPtrAddr(ret, handle);
  R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
  R_API_END();
  UNPROTECT(1);
@@ -190,29 +270,31 @@ XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data, SEXP

XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP num_col,
                                      SEXP missing, SEXP n_threads) {
  SEXP ret;
  SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
  R_API_BEGIN();
  std::int32_t threads = asInteger(n_threads);

  using xgboost::Integer;
  using xgboost::Json;
  using xgboost::Object;

  std::string sindptr, sindices, sdata;
  CreateFromSparse(indptr, indices, data, &sindptr, &sindices, &sdata);
  auto ncol = static_cast<std::size_t>(INTEGER(num_col)[0]);

  DMatrixHandle handle;
  Json jconfig{Object{}};
  // Construct configuration
  jconfig["nthread"] = Integer{threads};
  jconfig["missing"] = xgboost::Number{asReal(missing)};
  std::string config;
  Json::Dump(jconfig, &config);
  CHECK_CALL(XGDMatrixCreateFromCSR(sindptr.c_str(), sindices.c_str(), sdata.c_str(), ncol,
                                    config.c_str(), &handle));
  ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));

  int res_code;
  {
    using xgboost::Integer;
    using xgboost::Json;
    using xgboost::Object;

    std::string sindptr, sindices, sdata;
    CreateFromSparse(indptr, indices, data, &sindptr, &sindices, &sdata);
    auto ncol = static_cast<std::size_t>(INTEGER(num_col)[0]);

    Json jconfig{Object{}};
    // Construct configuration
    jconfig["nthread"] = Integer{threads};
    jconfig["missing"] = xgboost::Number{asReal(missing)};
    std::string config;
    Json::Dump(jconfig, &config);
    res_code = XGDMatrixCreateFromCSR(sindptr.c_str(), sindices.c_str(), sdata.c_str(), ncol,
                                      config.c_str(), &handle);
  }
  R_SetExternalPtrAddr(ret, handle);
  R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
  R_API_END();
  UNPROTECT(1);
@@ -220,19 +302,28 @@ XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP
}

XGB_DLL SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) {
  SEXP ret;
  SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
  R_API_BEGIN();
  int len = length(idxset);
  std::vector<int> idxvec(len);
  for (int i = 0; i < len; ++i) {
    idxvec[i] = INTEGER(idxset)[i] - 1;
  }
  R_xlen_t len = Rf_xlength(idxset);
  const int *idxset_ = INTEGER(idxset);
  DMatrixHandle res;
  CHECK_CALL(XGDMatrixSliceDMatrixEx(R_ExternalPtrAddr(handle),
                                     BeginPtr(idxvec), len,
                                     &res,
                                     0));
  ret = PROTECT(R_MakeExternalPtr(res, R_NilValue, R_NilValue));

  int res_code;
  {
    std::vector<int> idxvec(len);
#ifndef _MSC_VER
#pragma omp simd
#endif
    for (R_xlen_t i = 0; i < len; ++i) {
      idxvec[i] = idxset_[i] - 1;
    }
    res_code = XGDMatrixSliceDMatrixEx(R_ExternalPtrAddr(handle),
                                       BeginPtr(idxvec), len,
                                       &res,
                                       0);
  }
  CHECK_CALL(res_code);
  R_SetExternalPtrAddr(ret, res);
  R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
  R_API_END();
  UNPROTECT(1);
@@ -250,23 +341,15 @@ XGB_DLL SEXP XGDMatrixSaveBinary_R(SEXP handle, SEXP fname, SEXP silent) {

XGB_DLL SEXP XGDMatrixSetInfo_R(SEXP handle, SEXP field, SEXP array) {
  R_API_BEGIN();
  int len = length(array);
  const char *name = CHAR(asChar(field));
  auto ctx = DMatrixCtx(R_ExternalPtrAddr(handle));
  if (!strcmp("group", name)) {
    std::vector<unsigned> vec(len);
    xgboost::common::ParallelFor(len, ctx->Threads(), [&](xgboost::omp_ulong i) {
      vec[i] = static_cast<unsigned>(INTEGER(array)[i]);
    });
    CHECK_CALL(
        XGDMatrixSetUIntInfo(R_ExternalPtrAddr(handle), CHAR(asChar(field)), BeginPtr(vec), len));
  } else {
    std::vector<float> vec(len);
    xgboost::common::ParallelFor(len, ctx->Threads(),
                                 [&](xgboost::omp_ulong i) { vec[i] = REAL(array)[i]; });
    CHECK_CALL(
        XGDMatrixSetFloatInfo(R_ExternalPtrAddr(handle), CHAR(asChar(field)), BeginPtr(vec), len));
  SEXP field_ = PROTECT(Rf_asChar(field));
  int res_code;
  {
    const std::string array_str = MakeArrayInterfaceFromRVector(array);
    res_code = XGDMatrixSetInfoFromInterface(
        R_ExternalPtrAddr(handle), CHAR(field_), array_str.c_str());
  }
  CHECK_CALL(res_code);
  UNPROTECT(1);
  R_API_END();
  return R_NilValue;
}
@@ -275,18 +358,30 @@ XGB_DLL SEXP XGDMatrixSetStrFeatureInfo_R(SEXP handle, SEXP field, SEXP array) {
  R_API_BEGIN();
  size_t len{0};
  if (!isNull(array)) {
    len = length(array);
    len = Rf_xlength(array);
  }

  const char *name = CHAR(asChar(field));
  std::vector<std::string> str_info;
  SEXP str_info_holder = PROTECT(Rf_allocVector(VECSXP, len));
  for (size_t i = 0; i < len; ++i) {
    str_info.emplace_back(CHAR(asChar(VECTOR_ELT(array, i))));
    SET_VECTOR_ELT(str_info_holder, i, Rf_asChar(VECTOR_ELT(array, i)));
  }
  std::vector<char const*> vec(len);
  std::transform(str_info.cbegin(), str_info.cend(), vec.begin(),
                 [](std::string const &str) { return str.c_str(); });
  CHECK_CALL(XGDMatrixSetStrFeatureInfo(R_ExternalPtrAddr(handle), name, vec.data(), len));

  SEXP field_ = PROTECT(Rf_asChar(field));
  const char *name = CHAR(field_);
  int res_code;
  {
    std::vector<std::string> str_info;
    str_info.reserve(len);
    for (size_t i = 0; i < len; ++i) {
      str_info.emplace_back(CHAR(VECTOR_ELT(str_info_holder, i)));
    }
    std::vector<char const*> vec(len);
    std::transform(str_info.cbegin(), str_info.cend(), vec.begin(),
                   [](std::string const &str) { return str.c_str(); });
    res_code = XGDMatrixSetStrFeatureInfo(R_ExternalPtrAddr(handle), name, vec.data(), len);
  }
  CHECK_CALL(res_code);
  UNPROTECT(2);
  R_API_END();
  return R_NilValue;
}
@@ -319,8 +414,9 @@ XGB_DLL SEXP XGDMatrixGetInfo_R(SEXP handle, SEXP field) {
  const float *res;
  CHECK_CALL(XGDMatrixGetFloatInfo(R_ExternalPtrAddr(handle), CHAR(asChar(field)), &olen, &res));
  ret = PROTECT(allocVector(REALSXP, olen));
  double *ret_ = REAL(ret);
  for (size_t i = 0; i < olen; ++i) {
    REAL(ret)[i] = res[i];
    ret_[i] = res[i];
  }
  R_API_END();
  UNPROTECT(1);
@@ -351,16 +447,21 @@ void _BoosterFinalizer(SEXP ext) {
}

XGB_DLL SEXP XGBoosterCreate_R(SEXP dmats) {
  SEXP ret;
  SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
  R_API_BEGIN();
  int len = length(dmats);
  std::vector<void*> dvec;
  for (int i = 0; i < len; ++i) {
    dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
  }
  R_xlen_t len = Rf_xlength(dmats);
  BoosterHandle handle;
  CHECK_CALL(XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle));
  ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));

  int res_code;
  {
    std::vector<void*> dvec(len);
    for (R_xlen_t i = 0; i < len; ++i) {
      dvec[i] = R_ExternalPtrAddr(VECTOR_ELT(dmats, i));
    }
    res_code = XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle);
  }
  CHECK_CALL(res_code);
  R_SetExternalPtrAddr(ret, handle);
  R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE);
  R_API_END();
  UNPROTECT(1);
@@ -369,13 +470,18 @@ XGB_DLL SEXP XGBoosterCreate_R(SEXP dmats) {

XGB_DLL SEXP XGBoosterCreateInEmptyObj_R(SEXP dmats, SEXP R_handle) {
  R_API_BEGIN();
  int len = length(dmats);
  std::vector<void*> dvec;
  for (int i = 0; i < len; ++i) {
    dvec.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
  }
  R_xlen_t len = Rf_xlength(dmats);
  BoosterHandle handle;
  CHECK_CALL(XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle));

  int res_code;
  {
    std::vector<void*> dvec(len);
    for (R_xlen_t i = 0; i < len; ++i) {
      dvec[i] = R_ExternalPtrAddr(VECTOR_ELT(dmats, i));
    }
    res_code = XGBoosterCreate(BeginPtr(dvec), dvec.size(), &handle);
  }
  CHECK_CALL(res_code);
  R_SetExternalPtrAddr(R_handle, handle);
  R_RegisterCFinalizerEx(R_handle, _BoosterFinalizer, TRUE);
  R_API_END();
@@ -384,9 +490,12 @@ XGB_DLL SEXP XGBoosterCreateInEmptyObj_R(SEXP dmats, SEXP R_handle) {

XGB_DLL SEXP XGBoosterSetParam_R(SEXP handle, SEXP name, SEXP val) {
  R_API_BEGIN();
  SEXP name_ = PROTECT(Rf_asChar(name));
  SEXP val_ = PROTECT(Rf_asChar(val));
  CHECK_CALL(XGBoosterSetParam(R_ExternalPtrAddr(handle),
                               CHAR(asChar(name)),
                               CHAR(asChar(val))));
                               CHAR(name_),
                               CHAR(val_)));
  UNPROTECT(2);
  R_API_END();
  return R_NilValue;
}
@@ -402,7 +511,7 @@ XGB_DLL SEXP XGBoosterUpdateOneIter_R(SEXP handle, SEXP iter, SEXP dtrain) {

XGB_DLL SEXP XGBoosterTrainOneIter_R(SEXP handle, SEXP dtrain, SEXP iter, SEXP grad, SEXP hess) {
  R_API_BEGIN();
  CHECK_EQ(length(grad), length(hess)) << "gradient and hess must have same length.";
  CHECK_EQ(Rf_xlength(grad), Rf_xlength(hess)) << "gradient and hess must have same length.";
  SEXP gdim = getAttrib(grad, R_DimSymbol);
  auto n_samples = static_cast<std::size_t>(INTEGER(gdim)[0]);
  auto n_targets = static_cast<std::size_t>(INTEGER(gdim)[1]);
@@ -413,11 +522,15 @@ XGB_DLL SEXP XGBoosterTrainOneIter_R(SEXP handle, SEXP dtrain, SEXP iter, SEXP g
  double const *d_grad = REAL(grad);
  double const *d_hess = REAL(hess);

  auto ctx = xgboost::detail::BoosterCtx(R_ExternalPtrAddr(handle));
  auto [s_grad, s_hess] = xgboost::detail::MakeGradientInterface(
      ctx, d_grad, d_hess, xgboost::linalg::kF, n_samples, n_targets);
  CHECK_CALL(XGBoosterTrainOneIter(R_ExternalPtrAddr(handle), R_ExternalPtrAddr(dtrain),
                                   asInteger(iter), s_grad.c_str(), s_hess.c_str()));
  int res_code;
  {
    auto ctx = xgboost::detail::BoosterCtx(R_ExternalPtrAddr(handle));
    auto [s_grad, s_hess] = xgboost::detail::MakeGradientInterface(
        ctx, d_grad, d_hess, xgboost::linalg::kF, n_samples, n_targets);
    res_code = XGBoosterTrainOneIter(R_ExternalPtrAddr(handle), R_ExternalPtrAddr(dtrain),
                                     asInteger(iter), s_grad.c_str(), s_hess.c_str());
  }
  CHECK_CALL(res_code);

  R_API_END();
  return R_NilValue;
@@ -426,24 +539,35 @@ XGB_DLL SEXP XGBoosterTrainOneIter_R(SEXP handle, SEXP dtrain, SEXP iter, SEXP g
XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evnames) {
  const char *ret;
  R_API_BEGIN();
  CHECK_EQ(length(dmats), length(evnames))
  CHECK_EQ(Rf_xlength(dmats), Rf_xlength(evnames))
      << "dmats and evnams must have same length";
  int len = length(dmats);
  std::vector<void*> vec_dmats;
  std::vector<std::string> vec_names;
  std::vector<const char*> vec_sptr;
  for (int i = 0; i < len; ++i) {
    vec_dmats.push_back(R_ExternalPtrAddr(VECTOR_ELT(dmats, i)));
    vec_names.emplace_back(CHAR(asChar(VECTOR_ELT(evnames, i))));
  R_xlen_t len = Rf_xlength(dmats);
  SEXP evnames_lst = PROTECT(Rf_allocVector(VECSXP, len));
  for (R_xlen_t i = 0; i < len; i++) {
    SET_VECTOR_ELT(evnames_lst, i, Rf_asChar(VECTOR_ELT(evnames, i)));
  }
  for (int i = 0; i < len; ++i) {
    vec_sptr.push_back(vec_names[i].c_str());

  int res_code;
  {
    std::vector<void*> vec_dmats(len);
    std::vector<std::string> vec_names;
    vec_names.reserve(len);
    std::vector<const char*> vec_sptr(len);
    for (R_xlen_t i = 0; i < len; ++i) {
      vec_dmats[i] = R_ExternalPtrAddr(VECTOR_ELT(dmats, i));
      vec_names.emplace_back(CHAR(VECTOR_ELT(evnames_lst, i)));
    }
    for (R_xlen_t i = 0; i < len; ++i) {
      vec_sptr[i] = vec_names[i].c_str();
    }
    res_code = XGBoosterEvalOneIter(R_ExternalPtrAddr(handle),
                                    asInteger(iter),
                                    BeginPtr(vec_dmats),
                                    BeginPtr(vec_sptr),
                                    len, &ret);
  }
  CHECK_CALL(XGBoosterEvalOneIter(R_ExternalPtrAddr(handle),
                                  asInteger(iter),
                                  BeginPtr(vec_dmats),
                                  BeginPtr(vec_sptr),
                                  len, &ret));
  CHECK_CALL(res_code);
  UNPROTECT(1);
  R_API_END();
  return mkString(ret);
}
@@ -451,10 +575,11 @@ XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evn
XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config) {
  SEXP r_out_shape;
  SEXP r_out_result;
  SEXP r_out;
  SEXP r_out = PROTECT(allocVector(VECSXP, 2));
  SEXP json_config_ = PROTECT(Rf_asChar(json_config));

  R_API_BEGIN();
  char const *c_json_config = CHAR(asChar(json_config));
  char const *c_json_config = CHAR(json_config_);

  bst_ulong out_dim;
  bst_ulong const *out_shape;
@@ -465,23 +590,19 @@ XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_con

  r_out_shape = PROTECT(allocVector(INTSXP, out_dim));
  size_t len = 1;
  int *r_out_shape_ = INTEGER(r_out_shape);
  for (size_t i = 0; i < out_dim; ++i) {
    INTEGER(r_out_shape)[i] = out_shape[i];
    r_out_shape_[i] = out_shape[i];
    len *= out_shape[i];
  }
  r_out_result = PROTECT(allocVector(REALSXP, len));
  auto ctx = xgboost::detail::BoosterCtx(R_ExternalPtrAddr(handle));
  xgboost::common::ParallelFor(len, ctx->Threads(), [&](xgboost::omp_ulong i) {
    REAL(r_out_result)[i] = out_result[i];
  });

  r_out = PROTECT(allocVector(VECSXP, 2));
  std::copy(out_result, out_result + len, REAL(r_out_result));

  SET_VECTOR_ELT(r_out, 0, r_out_shape);
  SET_VECTOR_ELT(r_out, 1, r_out_result);

  R_API_END();
  UNPROTECT(3);
  UNPROTECT(4);

  return r_out;
}
@@ -504,7 +625,7 @@ XGB_DLL SEXP XGBoosterLoadModelFromRaw_R(SEXP handle, SEXP raw) {
  R_API_BEGIN();
  CHECK_CALL(XGBoosterLoadModelFromBuffer(R_ExternalPtrAddr(handle),
                                          RAW(raw),
                                          length(raw)));
                                          Rf_xlength(raw)));
  R_API_END();
  return R_NilValue;
}
@@ -562,45 +683,54 @@ XGB_DLL SEXP XGBoosterUnserializeFromBuffer_R(SEXP handle, SEXP raw) {
  R_API_BEGIN();
  CHECK_CALL(XGBoosterUnserializeFromBuffer(R_ExternalPtrAddr(handle),
                                            RAW(raw),
                                            length(raw)));
                                            Rf_xlength(raw)));
  R_API_END();
  return R_NilValue;
}

XGB_DLL SEXP XGBoosterDumpModel_R(SEXP handle, SEXP fmap, SEXP with_stats, SEXP dump_format) {
  SEXP out;
  SEXP continuation_token = PROTECT(R_MakeUnwindCont());
  SEXP dump_format_ = PROTECT(Rf_asChar(dump_format));
  SEXP fmap_ = PROTECT(Rf_asChar(fmap));
  R_API_BEGIN();
  bst_ulong olen;
  const char **res;
  const char *fmt = CHAR(asChar(dump_format));
  const char *fmt = CHAR(dump_format_);
  CHECK_CALL(XGBoosterDumpModelEx(R_ExternalPtrAddr(handle),
                                  CHAR(asChar(fmap)),
                                  CHAR(fmap_),
                                  asInteger(with_stats),
                                  fmt,
                                  &olen, &res));
  out = PROTECT(allocVector(STRSXP, olen));
  if (!strcmp("json", fmt)) {
    std::stringstream stream;
    stream << "[\n";
    for (size_t i = 0; i < olen; ++i) {
      stream << res[i];
      if (i < olen - 1) {
        stream << ",\n";
      } else {
        stream << "\n";
  try {
    if (!strcmp("json", fmt)) {
      std::stringstream stream;
      stream << "[\n";
      for (size_t i = 0; i < olen; ++i) {
        stream << res[i];
        if (i < olen - 1) {
          stream << ",\n";
        } else {
          stream << "\n";
        }
      }
      stream << "]";
      const std::string temp_str = stream.str();
      SET_STRING_ELT(out, 0, SafeMkChar(temp_str.c_str(), continuation_token));
    } else {
      for (size_t i = 0; i < olen; ++i) {
        std::stringstream stream;
        stream << "booster[" << i <<"]\n" << res[i];
        const std::string temp_str = stream.str();
        SET_STRING_ELT(out, i, SafeMkChar(temp_str.c_str(), continuation_token));
      }
    }
    stream << "]";
    SET_STRING_ELT(out, 0, mkChar(stream.str().c_str()));
  } else {
    for (size_t i = 0; i < olen; ++i) {
      std::stringstream stream;
      stream << "booster[" << i <<"]\n" << res[i];
      SET_STRING_ELT(out, i, mkChar(stream.str().c_str()));
    }
  } catch (ErrorWithUnwind &e) {
    R_ContinueUnwind(continuation_token);
  }
  R_API_END();
  UNPROTECT(1);
  UNPROTECT(4);
  return out;
}

@@ -626,9 +756,19 @@ XGB_DLL SEXP XGBoosterGetAttr_R(SEXP handle, SEXP name) {

XGB_DLL SEXP XGBoosterSetAttr_R(SEXP handle, SEXP name, SEXP val) {
  R_API_BEGIN();
  const char *v = isNull(val) ? nullptr : CHAR(asChar(val));
  const char *v = nullptr;
  SEXP name_ = PROTECT(Rf_asChar(name));
  SEXP val_;
  int n_protected = 1;
  if (!Rf_isNull(val)) {
    val_ = PROTECT(Rf_asChar(val));
    n_protected++;
    v = CHAR(val_);
  }

  CHECK_CALL(XGBoosterSetAttr(R_ExternalPtrAddr(handle),
                              CHAR(asChar(name)), v));
                              CHAR(name_), v));
  UNPROTECT(n_protected);
  R_API_END();
  return R_NilValue;
}
@@ -657,7 +797,7 @@ XGB_DLL SEXP XGBoosterFeatureScore_R(SEXP handle, SEXP json_config) {
  SEXP out_features_sexp;
  SEXP out_scores_sexp;
  SEXP out_shape_sexp;
  SEXP r_out;
  SEXP r_out = PROTECT(allocVector(VECSXP, 3));

  R_API_BEGIN();
  char const *c_json_config = CHAR(asChar(json_config));
@@ -673,23 +813,20 @@ XGB_DLL SEXP XGBoosterFeatureScore_R(SEXP handle, SEXP json_config) {
                                    &out_dim, &out_shape, &out_scores));
  out_shape_sexp = PROTECT(allocVector(INTSXP, out_dim));
  size_t len = 1;
  int *out_shape_sexp_ = INTEGER(out_shape_sexp);
  for (size_t i = 0; i < out_dim; ++i) {
    INTEGER(out_shape_sexp)[i] = out_shape[i];
    out_shape_sexp_[i] = out_shape[i];
    len *= out_shape[i];
  }

  out_scores_sexp = PROTECT(allocVector(REALSXP, len));
  auto ctx = xgboost::detail::BoosterCtx(R_ExternalPtrAddr(handle));
  xgboost::common::ParallelFor(len, ctx->Threads(), [&](xgboost::omp_ulong i) {
    REAL(out_scores_sexp)[i] = out_scores[i];
  });

  out_features_sexp = PROTECT(allocVector(STRSXP, out_n_features));
  for (size_t i = 0; i < out_n_features; ++i) {
    SET_STRING_ELT(out_features_sexp, i, mkChar(out_features[i]));
  }

  r_out = PROTECT(allocVector(VECSXP, 3));
  out_scores_sexp = PROTECT(allocVector(REALSXP, len));
  std::copy(out_scores, out_scores + len, REAL(out_scores_sexp));

  SET_VECTOR_ELT(r_out, 0, out_features_sexp);
  SET_VECTOR_ELT(r_out, 1, out_shape_sexp);
  SET_VECTOR_ELT(r_out, 2, out_scores_sexp);
@@ -265,3 +265,35 @@ test_that("xgb.DMatrix: print", {
  })
  expect_equal(txt, "xgb.DMatrix dim: 6513 x 126 info: NA colnames: no")
})

test_that("xgb.DMatrix: Inf as missing", {
  x_inf <- matrix(as.numeric(1:10), nrow = 5)
  x_inf[2, 1] <- Inf

  x_nan <- x_inf
  x_nan[2, 1] <- NA_real_

  m_inf <- xgb.DMatrix(x_inf, nthread = n_threads, missing = Inf)
  xgb.DMatrix.save(m_inf, "inf.dmatrix")

  m_nan <- xgb.DMatrix(x_nan, nthread = n_threads, missing = NA_real_)
  xgb.DMatrix.save(m_nan, "nan.dmatrix")

  infconn <- file("inf.dmatrix", "rb")
  nanconn <- file("nan.dmatrix", "rb")

  expect_equal(file.size("inf.dmatrix"), file.size("nan.dmatrix"))

  bytes <- file.size("inf.dmatrix")
  infdmatrix <- readBin(infconn, "raw", n = bytes)
  nandmatrix <- readBin(nanconn, "raw", n = bytes)

  expect_equal(length(infdmatrix), length(nandmatrix))
  expect_equal(infdmatrix, nandmatrix)

  close(infconn)
  close(nanconn)

  file.remove("inf.dmatrix")
  file.remove("nan.dmatrix")
})
@@ -188,31 +188,45 @@ function(xgboost_set_hip_flags target)
    HIP_SEPARABLE_COMPILATION OFF)
endfunction(xgboost_set_hip_flags)

macro(xgboost_link_nccl target)
function(xgboost_link_nccl target)
  set(xgboost_nccl_flags -DXGBOOST_USE_NCCL=1)
  if(USE_DLOPEN_NCCL)
    list(APPEND xgboost_nccl_flags -DXGBOOST_USE_DLOPEN_NCCL=1)
  endif()

  if(BUILD_STATIC_LIB)
    target_include_directories(${target} PUBLIC ${NCCL_INCLUDE_DIR})
    target_compile_definitions(${target} PUBLIC -DXGBOOST_USE_NCCL=1)
    target_compile_definitions(${target} PUBLIC ${xgboost_nccl_flags})
    target_link_libraries(${target} PUBLIC ${NCCL_LIBRARY})
  else()
    target_include_directories(${target} PRIVATE ${NCCL_INCLUDE_DIR})
    target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NCCL=1)
    target_link_libraries(${target} PRIVATE ${NCCL_LIBRARY})
    target_compile_definitions(${target} PRIVATE ${xgboost_nccl_flags})
    if(NOT USE_DLOPEN_NCCL)
      target_link_libraries(${target} PRIVATE ${NCCL_LIBRARY})
    endif()
  endif()
endfunction()

function(xgboost_link_rccl target)
  set(xgboost_rccl_flags -DXGBOOST_USE_RCCL=1)
  if(USE_DLOPEN_RCCL)
    list(APPEND xgboost_rccl_flags -DXGBOOST_USE_DLOPEN_RCCL=1)
  endif()
endmacro()

macro(xgboost_link_rccl target)
  if(BUILD_STATIC_LIB)
    target_include_directories(${target} PUBLIC ${RCCL_INCLUDE_DIR})
    target_compile_definitions(${target} PUBLIC -DXGBOOST_USE_RCCL=1)
    target_compile_definitions(${target} PUBLIC ${xgboost_rccl_flags})
    target_link_directories(${target} PUBLIC ${HIP_LIB_INSTALL_DIR})
    target_link_libraries(${target} PUBLIC ${RCCL_LIBRARY})
  else()
    target_include_directories(${target} PRIVATE ${RCCL_INCLUDE_DIR})
    target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_RCCL=1)
    target_compile_definitions(${target} PRIVATE ${xgboost_rccl_flags})
    target_link_directories(${target} PUBLIC ${HIP_LIB_INSTALL_DIR})
    target_link_libraries(${target} PRIVATE ${RCCL_LIBRARY})
    if(NOT USE_DLOPEN_RCCL)
      target_link_libraries(${target} PRIVATE ${RCCL_LIBRARY})
    endif()
  endif()
endmacro()
endfunction()

# compile options
macro(xgboost_target_properties target)
@@ -54,17 +54,24 @@ find_path(NCCL_INCLUDE_DIR
  NAMES nccl.h
  HINTS ${NCCL_ROOT}/include $ENV{NCCL_ROOT}/include)

find_library(NCCL_LIBRARY
  NAMES ${NCCL_LIB_NAME}
  HINTS ${NCCL_ROOT}/lib $ENV{NCCL_ROOT}/lib/)
if(USE_DLOPEN_NCCL)
  include(FindPackageHandleStandardArgs)
  find_package_handle_standard_args(Nccl DEFAULT_MSG NCCL_INCLUDE_DIR)

  message(STATUS "Using nccl library: ${NCCL_LIBRARY}")
  mark_as_advanced(NCCL_INCLUDE_DIR)
else()
  find_library(NCCL_LIBRARY
    NAMES ${NCCL_LIB_NAME}
    HINTS ${NCCL_ROOT}/lib $ENV{NCCL_ROOT}/lib/)

  include(FindPackageHandleStandardArgs)
  find_package_handle_standard_args(Nccl DEFAULT_MSG
    NCCL_INCLUDE_DIR NCCL_LIBRARY)
  message(STATUS "Using nccl library: ${NCCL_LIBRARY}")

  mark_as_advanced(
    NCCL_INCLUDE_DIR
    NCCL_LIBRARY
  )
  include(FindPackageHandleStandardArgs)
  find_package_handle_standard_args(Nccl DEFAULT_MSG
    NCCL_INCLUDE_DIR NCCL_LIBRARY)

  mark_as_advanced(
    NCCL_INCLUDE_DIR
    NCCL_LIBRARY
  )
endif()
@@ -9,7 +9,7 @@ import os
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

import xgboost as xgb
from xgboost import dask as dxgb
from xgboost.dask import DaskDMatrix

@@ -48,14 +48,14 @@ def main(client):
        "lambda": 0.01,
        "alpha": 0.02,
    }
    output = xgb.dask.train(
    output = dxgb.train(
        client, params, dtrain, num_boost_round=100, evals=[(dtrain, "train")]
    )
    bst = output["booster"]
    history = output["history"]

    # you can pass output directly into `predict` too.
    prediction = xgb.dask.predict(client, bst, dtrain)
    prediction = dxgb.predict(client, bst, dtrain)
    print("Evaluation history: ", history)

    # Uncomment the following line to save the model to the disk

@@ -6,7 +6,7 @@ Example of training with Dask on CPU
from dask import array as da
from dask.distributed import Client, LocalCluster

import xgboost as xgb
from xgboost import dask as dxgb
from xgboost.dask import DaskDMatrix

@@ -25,7 +25,7 @@ def main(client):
    # distributed version of train returns a dictionary containing the
    # resulting booster and evaluation history obtained from
    # evaluation metrics.
    output = xgb.dask.train(
    output = dxgb.train(
        client,
        {"verbosity": 1, "tree_method": "hist"},
        dtrain,
@@ -36,7 +36,7 @@ def main(client):
    history = output["history"]

    # you can pass output directly into `predict` too.
    prediction = xgb.dask.predict(client, bst, dtrain)
    prediction = dxgb.predict(client, bst, dtrain)
    print("Evaluation history:", history)
    return prediction

@@ -8,6 +8,7 @@ from dask_ml.datasets import make_regression
from dask_ml.model_selection import train_test_split

import xgboost as xgb
import xgboost.dask as dxgb
from xgboost.dask import DaskDMatrix

@@ -61,7 +62,7 @@ def main(client):
    dtrain = DaskDMatrix(client, X_train, y_train)
    dtest = DaskDMatrix(client, X_test, y_test)

    output = xgb.dask.train(
    output = dxgb.train(
        client,
        {
            "verbosity": 1,

@@ -8,7 +8,6 @@ from dask import dataframe as dd
from dask.distributed import Client
from dask_cuda import LocalCUDACluster

import xgboost as xgb
from xgboost import dask as dxgb
from xgboost.dask import DaskDMatrix

@@ -21,7 +20,7 @@ def using_dask_matrix(client: Client, X: da.Array, y: da.Array) -> da.Array:
    # Use train method from xgboost.dask instead of xgboost. This distributed version
    # of train returns a dictionary containing the resulting booster and evaluation
    # history obtained from evaluation metrics.
    output = xgb.dask.train(
    output = dxgb.train(
        client,
        {
            "verbosity": 2,
@@ -37,7 +36,7 @@ def using_dask_matrix(client: Client, X: da.Array, y: da.Array) -> da.Array:
    history = output["history"]

    # you can pass output directly into `predict` too.
    prediction = xgb.dask.predict(client, bst, dtrain)
    prediction = dxgb.predict(client, bst, dtrain)
    print("Evaluation history:", history)
    return prediction

@@ -56,14 +55,14 @@ def using_quantile_device_dmatrix(client: Client, X: da.Array, y: da.Array) -> d
    # be used for anything else other than training unless a reference is specified. See
    # the `ref` argument of `DaskQuantileDMatrix`.
    dtrain = dxgb.DaskQuantileDMatrix(client, X, y)
    output = xgb.dask.train(
    output = dxgb.train(
        client,
        {"verbosity": 2, "tree_method": "hist", "device": "cuda"},
        dtrain,
        num_boost_round=4,
    )

    prediction = xgb.dask.predict(client, output, X)
    prediction = dxgb.predict(client, output, X)
    return prediction


@@ -5,7 +5,7 @@ Use scikit-learn regressor interface with CPU histogram tree method
from dask import array as da
from dask.distributed import Client, LocalCluster

import xgboost
from xgboost import dask as dxgb


def main(client):
@@ -16,7 +16,7 @@ def main(client):
    X = da.random.random((m, n), partition_size)
    y = da.random.random(m, partition_size)

    regressor = xgboost.dask.DaskXGBRegressor(verbosity=1, n_estimators=2)
    regressor = dxgb.DaskXGBRegressor(verbosity=1, n_estimators=2)
    regressor.set_params(tree_method="hist")
    # assigning client here is optional
    regressor.client = client

@@ -9,7 +9,7 @@ from dask.distributed import Client

# It's recommended to use dask_cuda for GPU assignment
from dask_cuda import LocalCUDACluster

import xgboost
from xgboost import dask as dxgb


def main(client):
@@ -20,7 +20,7 @@ def main(client):
    X = da.random.random((m, n), partition_size)
    y = da.random.random(m, partition_size)

    regressor = xgboost.dask.DaskXGBRegressor(verbosity=1)
    regressor = dxgb.DaskXGBRegressor(verbosity=1)
    # set the device to CUDA
    regressor.set_params(tree_method="hist", device="cuda")
    # assigning client here is optional
@@ -118,16 +118,40 @@ two automatic checks to enforce coding style conventions. To expedite the code r

Linter
======
We use `pylint <https://github.com/PyCQA/pylint>`_ and `cpplint <https://github.com/cpplint/cpplint>`_ to enforce style convention and find potential errors. Linting is especially useful for Python, as we can catch many errors that would have otherwise occurred at run-time.
We use a combination of linters to enforce style convention and find potential errors. Linting is especially useful for scripting languages like Python, as we can catch many errors that would have otherwise occurred at run-time.

To run this check locally, run the following command from the top level source tree:
For Python scripts, `pylint <https://github.com/PyCQA/pylint>`_, `black <https://github.com/psf/black>`__ and `isort <https://github.com/PyCQA/isort>`__ are used for providing guidance on coding style, and `mypy <https://github.com/python/mypy>`__ is required for type checking. For C++, `cpplint <https://github.com/cpplint/cpplint>`_ is used along with ``clang-tidy``. For R, ``lintr`` is used.

To run checks for Python locally, install the checkers mentioned previously and run:

.. code-block:: bash

  cd /path/to/xgboost/
  make lint
  python ./tests/ci_build/lint_python.py --fix

To run checks for R:

.. code-block:: bash

  cd /path/to/xgboost/
  Rscript tests/ci_build/lint_r.R $(pwd)

To run checks for cpplint locally:

.. code-block:: bash

  cd /path/to/xgboost/
  python ./tests/ci_build/lint_cpp.py

See next section for clang-tidy. For CMake scripts:

.. code-block:: bash

  bash ./tests/ci_build/lint_cmake.sh

Lastly, the linter for jvm-packages is integrated into the maven build process.

This command requires the Python packages pylint and cpplint.

Clang-tidy
==========
62 doc/contrib/consistency.rst (new file)
@@ -0,0 +1,62 @@
#################################
Consistency for Language Bindings
#################################

XGBoost has many different language bindings developed over the years; some live in the main repository while others are maintained independently. Many features and interfaces are inconsistent with each other, so this document provides guidelines and actionable items for language binding designers.

*******************
Model Serialization
*******************

The XGBoost C API exposes a couple of functions for serializing a model for persistent storage. These saved files are backward compatible, meaning one can load an older XGBoost model with a newer XGBoost version. If there's a change in the model format, we post a deprecation notice inside the C++ implementation and a public issue for tracking the status. See :doc:`/tutorials/saving_model` for details.

As a result, these formats are considered stable and should work across language bindings. For instance, a model trained in R should be fully functional in C or Python. Please don't pad anything onto the output file or buffer.
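As an illustration, a minimal round trip through the stable format using the public Python API (the file name is arbitrary):

.. code-block:: python

    import numpy as np
    import xgboost as xgb

    X = np.random.rand(100, 4)
    y = np.random.randint(0, 2, size=100)
    booster = xgb.train(
        {"objective": "binary:logistic"}, xgb.DMatrix(X, label=y), num_boost_round=2
    )

    # Save with the stable serialization format (UBJSON here, chosen by extension).
    booster.save_model("model.ubj")

    # The same file can be loaded unchanged by the C, R, or JVM bindings.
    loaded = xgb.Booster(model_file="model.ubj")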
If there are extra fields that must be saved:

- First review whether the attribute can be retrieved from known properties of the model. For instance, there's a :py:attr:`~xgboost.XGBClassifier.classes_` attribute in the scikit-learn interface :py:class:`~xgboost.XGBClassifier`, which can be obtained through `numpy.arange(n_classes)` and doesn't need to be saved into the model. Preserving version compatibility is not a trivial task and we still spend a significant amount of time maintaining it. Please don't add complexity where it's not necessary.

- Then consider whether it's universal. For instance, we added `feature_types` to the model serialization for categorical features (a new feature after 1.6); the attribute is useful, or will be useful in the future, regardless of the language binding.

- If the field is small, we can save it as a model attribute (which is a key-value structure); see the sketch after this list. These attributes are ignored by all other language bindings and are mostly ad-hoc storage.

- Lastly, we should use UBJSON as the default output format when given a chance (so as not to be burdened by the old binary format).
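A small sketch of the attribute route in the Python binding, continuing from the example above (the field name is hypothetical):

.. code-block:: python

    # Store a small, ad-hoc field as a model attribute (a key-value string store).
    booster.set_attr(my_custom_field="42")

    # The attribute survives save/load but is ignored by other language bindings.
    assert booster.attr("my_custom_field") == "42"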
*********************
Training Continuation
*********************

There are cases where we want to train a model based on a previous model; for boosted trees, that means either adding new trees or modifying the existing trees. This can be a normal model update, error recovery, or other special cases we don't know of yet. When it happens, the training iteration should start from 0, not from the last boosted round of the model. 0 is a special iteration number: we perform some extra checks, like whether the label is valid, during that iteration. These checks can be expensive but are necessary for eliminating silent errors. Having the iteration start from zero allows us to perform these checks only once for each input dataset.
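In the Python binding, training continuation is spelled with the ``xgb_model`` argument; a minimal sketch, assuming ``params`` and a ``dtrain`` DMatrix already exist:

.. code-block:: python

    # Train an initial model, then add more trees on top of it.  Internally the
    # iteration counter restarts at 0, so the one-time validity checks run again
    # for this input data.
    booster = xgb.train(params, dtrain, num_boost_round=10)
    booster = xgb.train(params, dtrain, num_boost_round=5, xgb_model=booster)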
*********
Inference
*********

The inference function is quite inconsistent among language bindings at the time of writing due to historical reasons, but this makes it all the more important to keep consistency in mind during future development.

- First, the output shape. There's a relatively new parameter called ``strict_shape`` in XGBoost that is rarely used. We want to make it the default behavior but can't due to compatibility concerns. See :doc:`/prediction` for details. In short, if specified, the XGBoost C++ implementation can output predictions with the correct shape, instead of leaving the language binding to handle it.
- Policy around early stopping is at the moment inconsistent between various interfaces. Some consider the ``best_iteration`` attribute while others don't. We should formalize that all interfaces in the future use the ``best_iteration`` during inference unless the user has explicitly specified the ``iteration_range`` parameter.
|
||||
|
||||
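A sketch with the Python binding (other bindings should converge to the same semantics):

.. code-block:: python

   import numpy as np
   import xgboost as xgb

   X = np.random.rand(128, 4)
   y = np.random.rand(128)
   dtrain = xgb.DMatrix(X[:96], label=y[:96])
   dvalid = xgb.DMatrix(X[96:], label=y[96:])

   booster = xgb.train(
       {"objective": "reg:squarederror"},
       dtrain,
       num_boost_round=32,
       evals=[(dvalid, "valid")],
       early_stopping_rounds=2,
   )

   # ``strict_shape`` asks the C++ core to return a fully specified output
   # shape instead of letting the binding squeeze dimensions.
   predt = booster.predict(dvalid, strict_shape=True)

   # Bindings should honour ``best_iteration`` by default; the explicit
   # ``iteration_range`` below spells out what that means.
   predt = booster.predict(dvalid, iteration_range=(0, booster.best_iteration + 1))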
****************
Parameter naming
****************

There are many parameter naming conventions out there, and some XGBoost interfaces try to align with their larger communities. For example, the R package might support a parameter naming like ``max.depth=3``, while the Spark package might support ``MaxDepth=3``. These are fine; it's better for users to keep their pipelines consistent. However, while supporting naming variants, the normal XGBoost way of naming should also be supported, meaning ``max_depth=3`` should be a valid parameter no matter what language one is using. If someone were to pass the duplicated parameters ``max.depth=3, max_depth=3``, a clear error should be preferred over silently prioritizing one spelling over the other.
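A sketch of what this policy implies for a binding's parameter handling; ``ALIASES`` and ``normalize_params`` are hypothetical names for illustration, not an existing XGBoost API:

.. code-block:: python

   # Hypothetical alias table for a binding that supports ``max.depth``.
   ALIASES = {"max.depth": "max_depth"}


   def normalize_params(params: dict) -> dict:
       """Map binding-specific names onto canonical XGBoost names."""
       out = {}
       for key, value in params.items():
           canonical = ALIASES.get(key, key)
           if canonical in out:
               # Duplicated parameter: fail loudly instead of silently
               # preferring one spelling over the other.
               raise ValueError(f"Duplicated parameter: {canonical}")
           out[canonical] = value
       return out


   normalize_params({"max.depth": 3})                   # {'max_depth': 3}
   normalize_params({"max.depth": 3, "max_depth": 3})   # raises ValueError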
******************
Default Parameters
******************

Like many other machine learning libraries, all parameters from XGBoost can either be inferred from the data or have default values. Bindings should not make copies of these default values but let the XGBoost core decide. When a parameter key is not passed into the C++ core, XGBoost picks the default accordingly. These defaults are not necessarily optimal, but they are there for consistency. If there's a new choice of default parameter, we can change it inside the core and it will automatically propagate to all bindings. Given the same set of parameters and data, the various bindings should strive to produce the same model. One exception is ``num_boost_rounds``, which exists only in high-level bindings and has various aliases like ``n_estimators``. Its default value is close to arbitrary at the moment; we haven't been able to settle on a good default yet.
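One way to observe the defaults being resolved by the core rather than the binding is to inspect the booster configuration after training (a Python sketch; the exact JSON layout is an implementation detail):

.. code-block:: python

   import json

   import numpy as np
   import xgboost as xgb

   X = np.random.rand(32, 4)
   y = np.random.rand(32)
   dtrain = xgb.DMatrix(X, label=y)

   # Only the objective is specified; everything else is left to the core.
   booster = xgb.train({"objective": "reg:squarederror"}, dtrain, num_boost_round=1)

   # The resolved values, including defaults picked by the C++ core, are
   # visible in the saved configuration.
   config = json.loads(booster.save_config())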
*******
Logging
*******

XGBoost has a default logger built in that can be a wrapper over a binding-specific logging facility. For instance, the Python binding registers a callback to use the Python :py:mod:`warnings` module and the :py:func:`print` function for logging output. We want to keep logging native to the larger communities instead of using ``std::cerr`` from C++.
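From the user's perspective this means the host language's standard tooling applies; for example, in Python (a sketch):

.. code-block:: python

   import warnings

   import xgboost as xgb

   # Warnings emitted by the core surface through Python's ``warnings``
   # module, so the usual filters work; verbosity is controlled through
   # the binding as well.
   with warnings.catch_warnings():
       warnings.simplefilter("ignore")
       with xgb.config_context(verbosity=0):
           pass  # calls that would otherwise log or warn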
***********************************
Minimum Amount of Data Manipulation
***********************************

XGBoost is mostly a machine learning library providing a boosting algorithm implementation. Some other implementations might perform some data manipulation implicitly, like deciding the encoding of the data or transforming the data according to some heuristic before training. We prefer to keep these operations based on necessity instead of convenience, to keep the scope of the project well-defined. Whenever possible, we should leave these features to third-party libraries and consider how a user can compose their own pipeline. For instance, XGBoost itself should not perform ordinal encoding for categorical data; users should pick an encoder that fits their use case (out-of-core implementation, distributed implementation, known mapping, etc.). If some transformation is decided to be part of the algorithm, such as target encoding or sketching the response variables, we should have it inside the core implementation as part of the ML algorithm instead of in the language binding. This aligns with the principle of default parameters: the various bindings should provide similar (if not the same) results given the same set of parameters and data.
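For instance, with the Python binding the encoding step is composed by the user (a sketch using pandas; any encoder with the right properties would do):

.. code-block:: python

   import pandas as pd
   import xgboost as xgb

   df = pd.DataFrame({"color": ["red", "green", "red"], "value": [1.0, 2.0, 3.0]})

   # The user chooses the encoder; here pandas' categorical dtype carries the
   # ordinal mapping, and XGBoost only consumes the result.
   df["color"] = df["color"].astype("category")

   dtrain = xgb.DMatrix(df, label=[0, 1, 0], enable_categorical=True)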
@ -23,6 +23,7 @@ Here are guidelines for contributing to various aspect of the XGBoost project:

   Community Guideline <community>
   donate
   coding_guide
   consistency
   python_packaging
   unit_tests
   Docs and Examples <docs>

@ -18,9 +18,9 @@ Build an ML Application with XGBoost4J-Spark-GPU
Add XGBoost to Your Project
===========================

Before we go into the tour of how to use XGBoost4J-Spark-GPU, you should first consult
:ref:`Installation from Maven repository <install_jvm_packages>` in order to add XGBoost4J-Spark-GPU as
a dependency for your project. We provide both stable releases and snapshots.
Prior to delving into the tutorial on utilizing XGBoost4J-Spark-GPU, it is advisable to refer to
:ref:`Installation from Maven repository <install_jvm_packages>` for instructions on adding XGBoost4J-Spark-GPU
as a project dependency. We offer both stable releases and snapshots for your convenience.

Data Preparation
================
@ -54,7 +54,7 @@ Read Dataset with Spark's Built-In Reader
    .schema(schema)
    .csv(dataPath)

In the first line, we create an instance of a `SparkSession <https://spark.apache.org/docs/latest/sql-getting-started.html#starting-point-sparksession>`_
At first, we create an instance of a `SparkSession <https://spark.apache.org/docs/latest/sql-getting-started.html#starting-point-sparksession>`_
which is the entry point of any Spark application working with DataFrames. The ``schema`` variable
defines the schema of the DataFrame wrapping Iris data. With this explicitly set schema, we
can define the column names as well as their types; otherwise the column names would be
@ -112,7 +112,7 @@ models. Although we use the Iris dataset in this tutorial to show how we use
``XGBoost/XGBoost4J-Spark-GPU`` to resolve a multi-classes classification problem, the
usage in Regression is very similar to classification.

To train a XGBoost model for classification, we need to claim a XGBoostClassifier first:
To train a XGBoost model for classification, we need to define a XGBoostClassifier first:

.. code-block:: scala

@ -130,9 +130,13 @@ To train a XGBoost model for classification, we need to claim a XGBoostClassifie
    .setFeaturesCol(featuresNames)
    .setLabelCol(labelName)

The ``device`` parameter is for informing XGBoost that CUDA devices should be used instead of CPU. Unlike the single-node mode, GPUs are managed by spark instead of by XGBoost. Therefore, explicitly specified device ordinal like ``cuda:1`` is not support.
The ``device`` parameter is for informing XGBoost that CUDA devices should be used instead of CPU.
Unlike the single-node mode, GPUs are managed by Spark instead of by XGBoost. Therefore, an
explicitly specified device ordinal like ``cuda:1`` is not supported.

The available parameters for training a XGBoost model can be found in :doc:`here </parameter>`. Similar to the XGBoost4J-Spark package, in addition to the default set of parameters, XGBoost4J-Spark-GPU also supports the camel-case variant of these parameters to be consistent with Spark's MLlib naming convention.
The available parameters for training a XGBoost model can be found :doc:`here </parameter>`.
Similar to the XGBoost4J-Spark package, in addition to the default set of parameters,
XGBoost4J-Spark-GPU also supports the camel-case variant of these parameters to be consistent with Spark's MLlib naming convention.

Specifically, each parameter in :doc:`this page </parameter>` has its equivalent form in
XGBoost4J-Spark-GPU with camel case. For example, to set ``max_depth`` for each tree, you
@ -211,21 +215,36 @@ and the prediction for each instance.
Submit the application
**********************

Here’s an example to submit an end-to-end XGBoost-4j-Spark-GPU Spark application to an
Apache Spark Standalone cluster, assuming the application main class is Iris and the
application jar is iris-1.0.0.jar
We assume you have configured the Spark standalone cluster with GPU support; otherwise, please
refer to `spark standalone configuration with GPU support <https://nvidia.github.io/spark-rapids/docs/get-started/getting-started-on-prem.html#spark-standalone-cluster>`_.

Starting from XGBoost 2.1.0, stage-level scheduling is automatically enabled. Therefore,
if you are using Spark standalone cluster version 3.4.0 or higher, we strongly recommend
configuring ``"spark.task.resource.gpu.amount"`` as a fractional value. This will
enable running multiple tasks in parallel during the ETL phase. An example configuration
would be ``"spark.task.resource.gpu.amount=1/spark.executor.cores"``. However, if you are
using an XGBoost version earlier than 2.1.0 or a Spark standalone cluster version below 3.4.0,
you still need to set ``"spark.task.resource.gpu.amount"`` equal to ``"spark.executor.resource.gpu.amount"``.

.. note::

   As of now, the stage-level scheduling feature in XGBoost is limited to the Spark standalone cluster mode.
   However, we have plans to expand its compatibility to YARN and Kubernetes once Spark 3.5.1 is officially released.

Assuming that the application main class is "Iris" and the application jar is "iris-1.0.0.jar",
provided below is an example demonstrating how to submit the XGBoost application to an Apache
Spark Standalone cluster.

.. code-block:: bash

  cudf_version=22.02.0
  rapids_version=22.02.0
  xgboost_version=1.6.1
  rapids_version=23.10.0
  xgboost_version=2.0.1
  main_class=Iris
  app_jar=iris-1.0.0.jar

  spark-submit \
   --master $master \
   --packages ai.rapids:cudf:${cudf_version},com.nvidia:rapids-4-spark_2.12:${rapids_version},ml.dmlc:xgboost4j-gpu_2.12:${xgboost_version},ml.dmlc:xgboost4j-spark-gpu_2.12:${xgboost_version} \
   --packages com.nvidia:rapids-4-spark_2.12:${rapids_version},ml.dmlc:xgboost4j-gpu_2.12:${xgboost_version},ml.dmlc:xgboost4j-spark-gpu_2.12:${xgboost_version} \
   --conf spark.executor.cores=12 \
   --conf spark.task.cpus=1 \
   --conf spark.executor.resource.gpu.amount=1 \
@ -236,7 +255,7 @@ application jar is iris-1.0.0.jar
   --class ${main_class} \
   ${app_jar}

* First, we need to specify the ``RAPIDS Accelerator, cudf, xgboost4j-gpu, xgboost4j-spark-gpu`` packages by ``--packages``
* First, we need to specify the ``RAPIDS Accelerator, xgboost4j-gpu, xgboost4j-spark-gpu`` packages by ``--packages``
* Second, ``RAPIDS Accelerator`` is a Spark plugin, so we need to configure it by specifying ``spark.plugins=com.nvidia.spark.SQLPlugin``

For details about other ``RAPIDS Accelerator`` configurations, please refer to the `configuration <https://nvidia.github.io/spark-rapids/docs/configs.html>`_.

@ -470,7 +470,7 @@ Parameter for using Pseudo-Huber (``reg:pseudohubererror``)
Parameter for using Quantile Loss (``reg:quantileerror``)
=========================================================

* ``quantile_alpha``: A scala or a list of targeted quantiles.
* ``quantile_alpha``: A scalar or a list of targeted quantiles.

  .. versionadded:: 2.0.0

@ -39,7 +39,8 @@ on a dask cluster:

.. code-block:: python

    import xgboost as xgb
    from xgboost import dask as dxgb

    import dask.array as da
    import dask.distributed

@ -53,11 +54,11 @@ on a dask cluster:
    X = da.random.random(size=(num_obs, num_features), chunks=(1000, num_features))
    y = da.random.random(size=(num_obs, 1), chunks=(1000, 1))

    dtrain = xgb.dask.DaskDMatrix(client, X, y)
    dtrain = dxgb.DaskDMatrix(client, X, y)
    # or
    # dtrain = xgb.dask.DaskQuantileDMatrix(client, X, y)
    # dtrain = dxgb.DaskQuantileDMatrix(client, X, y)

    output = xgb.dask.train(
    output = dxgb.train(
        client,
        {"verbosity": 2, "tree_method": "hist", "objective": "reg:squarederror"},
        dtrain,
@ -87,25 +88,27 @@ returns a model and the computation history as a Python dictionary:

.. code-block:: python

    {'booster': Booster,
     'history': dict}
    {
        "booster": Booster,
        "history": dict,
    }

For prediction, pass the ``output`` returned by ``train`` into :py:func:`xgboost.dask.predict`:

.. code-block:: python

    prediction = xgb.dask.predict(client, output, dtrain)
    prediction = dxgb.predict(client, output, dtrain)
    # Or equivalently, pass ``output['booster']``:
    prediction = xgb.dask.predict(client, output['booster'], dtrain)
    prediction = dxgb.predict(client, output['booster'], dtrain)

Eliminating the construction of DaskDMatrix is also possible; this can make the
computation a bit faster when meta information like ``base_margin`` is not needed:

.. code-block:: python

    prediction = xgb.dask.predict(client, output, X)
    prediction = dxgb.predict(client, output, X)
    # Use inplace version.
    prediction = xgb.dask.inplace_predict(client, output, X)
    prediction = dxgb.inplace_predict(client, output, X)

Here ``prediction`` is a dask ``Array`` object containing predictions from the model if the input
is a ``DaskDMatrix`` or ``da.Array``. When putting a dask collection directly into the
@ -134,14 +137,14 @@ both memory usage and prediction time.
.. code-block:: python

    # dtrain is the DaskDMatrix defined above.
    prediction = xgb.dask.predict(client, booster, dtrain)
    prediction = dxgb.predict(client, booster, dtrain)

or equivalently:

.. code-block:: python

    # where X is a dask DataFrame or dask Array.
    prediction = xgb.dask.predict(client, booster, X)
    prediction = dxgb.predict(client, booster, X)

Also for inplace prediction:

@ -149,7 +152,7 @@ Also for inplace prediction:

    # where X is a dask DataFrame or dask Array backed by cupy or cuDF.
    booster.set_param({"device": "cuda"})
    prediction = xgb.dask.inplace_predict(client, booster, X)
    prediction = dxgb.inplace_predict(client, booster, X)

When the input is a ``da.Array`` object, the output is always ``da.Array``. However, if the input
type is ``dd.DataFrame``, the output can be ``dd.Series``, ``dd.DataFrame`` or ``da.Array``,
@ -174,7 +177,7 @@ One simple optimization for running consecutive predictions is using
    futures = []
    for X in dataset:
        # Here we pass in a future instead of concrete booster
        shap_f = xgb.dask.predict(client, booster_f, X, pred_contribs=True)
        shap_f = dxgb.predict(client, booster_f, X, pred_contribs=True)
        futures.append(shap_f)

    results = client.gather(futures)
@ -186,7 +189,7 @@ Scikit-Learn wrapper object:

.. code-block:: python

    cls = xgb.dask.DaskXGBClassifier()
    cls = dxgb.DaskXGBClassifier()
    cls.fit(X, y)

    booster = cls.get_booster()
@ -207,12 +210,12 @@ collection.
.. code-block:: python

    from distributed import LocalCluster, Client
    import xgboost as xgb
    from xgboost import dask as dxgb


    def main(client: Client) -> None:
        X, y = load_data()
        clf = xgb.dask.DaskXGBClassifier(n_estimators=100, tree_method="hist")
        clf = dxgb.DaskXGBClassifier(n_estimators=100, tree_method="hist")
        clf.client = client  # assign the client
        clf.fit(X, y, eval_set=[(X, y)])
        proba = clf.predict_proba(X)
@ -242,7 +245,7 @@ In the example below, a ``KubeCluster`` is used for `deploying Dask on Kubernete

    from dask_kubernetes import KubeCluster  # Need to install the ``dask-kubernetes`` package
    from dask.distributed import Client
    import xgboost as xgb
    from xgboost import dask as dxgb
    import dask
    import dask.array as da

@ -265,7 +268,7 @@ In the example below, a ``KubeCluster`` is used for `deploying Dask on Kubernete
    X = da.random.random(size=(m, n), chunks=100)
    y = da.random.random(size=(m, ), chunks=100)

    regressor = xgb.dask.DaskXGBRegressor(n_estimators=10, missing=0.0)
    regressor = dxgb.DaskXGBRegressor(n_estimators=10, missing=0.0)
    regressor.client = client
    regressor.set_params(tree_method='hist', device="cuda")
    regressor.fit(X, y, eval_set=[(X, y)])
@ -298,7 +301,7 @@ threads in each process for training. But if ``nthread`` parameter is set:

.. code-block:: python

    output = xgb.dask.train(
    output = dxgb.train(
        client,
        {"verbosity": 1, "nthread": 8, "tree_method": "hist"},
        dtrain,
@ -330,12 +333,12 @@ Functional interface:

    async with dask.distributed.Client(scheduler_address, asynchronous=True) as client:
        X, y = generate_array()
        m = await xgb.dask.DaskDMatrix(client, X, y)
        output = await xgb.dask.train(client, {}, dtrain=m)
        m = await dxgb.DaskDMatrix(client, X, y)
        output = await dxgb.train(client, {}, dtrain=m)

        with_m = await xgb.dask.predict(client, output, m)
        with_X = await xgb.dask.predict(client, output, X)
        inplace = await xgb.dask.inplace_predict(client, output, X)
        with_m = await dxgb.predict(client, output, m)
        with_X = await dxgb.predict(client, output, X)
        inplace = await dxgb.inplace_predict(client, output, X)

        # Use ``client.compute`` instead of the ``compute`` method from dask collection
        print(await client.compute(with_m))
@ -349,7 +352,7 @@ actual computation will return a coroutine and hence require awaiting:

    async with dask.distributed.Client(scheduler_address, asynchronous=True) as client:
        X, y = generate_array()
        regressor = await xgb.dask.DaskXGBRegressor(verbosity=1, n_estimators=2)
        regressor = await dxgb.DaskXGBRegressor(verbosity=1, n_estimators=2)
        regressor.set_params(tree_method='hist')  # trivial method, synchronous operation
        regressor.client = client  # accessing attribute, synchronous operation
        regressor = await regressor.fit(X, y, eval_set=[(X, y)])
@ -371,7 +374,7 @@ To enable early stopping, pass one or more validation sets containing ``DaskDMat
.. code-block:: python

    import dask.array as da
    import xgboost as xgb
    from xgboost import dask as dxgb

    num_rows = 1e6
    num_features = 100
@ -398,19 +401,19 @@ To enable early stopping, pass one or more validation sets containing ``DaskDMat
        chunks=(rows_per_chunk, 1)
    )

    dtrain = xgb.dask.DaskDMatrix(
    dtrain = dxgb.DaskDMatrix(
        client=client,
        data=data,
        label=labels
    )

    dvalid = xgb.dask.DaskDMatrix(
    dvalid = dxgb.DaskDMatrix(
        client=client,
        data=X_eval,
        label=y_eval
    )

    result = xgb.dask.train(
    result = dxgb.train(
        client=client,
        params={
            "objective": "reg:squarederror",
@ -421,7 +424,7 @@ To enable early stopping, pass one or more validation sets containing ``DaskDMat
        early_stopping_rounds=3
    )

When validation sets are provided to ``xgb.dask.train()`` in this way, the model object returned by ``xgb.dask.train()`` contains a history of evaluation metrics for each validation set, across all boosting rounds.
When validation sets are provided to :py:func:`xgboost.dask.train` in this way, the model object returned by :py:func:`xgboost.dask.train` contains a history of evaluation metrics for each validation set, across all boosting rounds.

.. code-block:: python

@ -463,7 +466,7 @@ interface, including callback functions, custom evaluation metric and objective:
        save_best=True,
    )

    booster = xgb.dask.train(
    booster = dxgb.train(
        client,
        params={
            "objective": "binary:logistic",
@ -533,6 +536,37 @@ Troubleshooting
- MIG (Multi-Instance GPU) is not yet supported by NCCL. You will receive an error message
  that includes `Multiple processes within a communication group ...` upon initialization.

.. _nccl-load:

- Starting from version 2.1.0, to reduce the size of the binary wheel, the XGBoost package
  (installed using pip) loads NCCL from the environment instead of bundling it
  directly. This means that if you encounter an error message like
  "Failed to load nccl ...", it indicates that NCCL is not installed or properly
  configured in your environment.

  To resolve this issue, you can install NCCL using pip:

  .. code-block:: sh

    pip install nvidia-nccl-cu12  # (or with any compatible CUDA version)

  The default conda installation of XGBoost should not encounter this error. If you are
  using a customized XGBoost, please make sure one of the following is true:

  + XGBoost is NOT compiled with the `USE_DLOPEN_NCCL` flag.
  + The `dmlc_nccl_path` parameter is set to the full NCCL path when initializing the collective.

  Here are some additional tips for troubleshooting NCCL dependency issues:

  + Check the NCCL installation path and verify that it's installed correctly. We try to
    find NCCL by using ``from nvidia.nccl import lib`` in Python when XGBoost is installed
    using pip.
  + Ensure that you have the correct CUDA version installed. NCCL requires a compatible
    CUDA version to function properly.
  + If you are not using distributed training with XGBoost and yet see this error, please
    open an issue on GitHub.
  + If you continue to encounter NCCL dependency issues, please open an issue on GitHub.

************
IPv6 Support
************

@ -1508,6 +1508,83 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, const char *config,
 * @{
 */

/**
 * @brief Handle to tracker.
 *
 * There are currently two types of tracker in XGBoost: the first one is `rabit`, while
 * the other one is `federated`.
 *
 * This is still under development.
 */
typedef void *TrackerHandle; /* NOLINT */

/**
 * @brief Create a new tracker.
 *
 * @param config JSON encoded parameters.
 *
 * - dmlc_communicator: String, the type of tracker to create. Available options are `rabit`
 *   and `federated`.
 * - n_workers: Integer, the number of workers.
 * - port: (Optional) Integer, the port this tracker should listen to.
 * - timeout: (Optional) Integer, timeout in seconds for various networking operations.
 *
 * Some configurations are `rabit` specific:
 * - host: (Optional) String, used by the `rabit` tracker to specify the address of the host.
 *
 * Some `federated` specific configurations:
 * - federated_secure: Boolean, whether this is a secure server.
 * - server_key_path: Path to the server key. Used only if this is a secure server.
 * - server_cert_path: Path to the server certificate. Used only if this is a secure server.
 * - client_cert_path: Path to the client certificate. Used only if this is a secure server.
 *
 * @param handle The handle to the created tracker.
 *
 * @return 0 for success, -1 for failure.
 */
XGB_DLL int XGTrackerCreate(char const *config, TrackerHandle *handle);

/**
 * @brief Get the arguments needed for running workers. This should be called after
 *        XGTrackerRun() and XGTrackerWait().
 *
 * @param handle The handle to the tracker.
 * @param args The arguments returned as a JSON document.
 *
 * @return 0 for success, -1 for failure.
 */
XGB_DLL int XGTrackerWorkerArgs(TrackerHandle handle, char const **args);

/**
 * @brief Run the tracker.
 *
 * @param handle The handle to the tracker.
 *
 * @return 0 for success, -1 for failure.
 */
XGB_DLL int XGTrackerRun(TrackerHandle handle);

/**
 * @brief Wait for the tracker to finish; should be called after XGTrackerRun().
 *
 * @param handle The handle to the tracker.
 * @param config JSON encoded configuration. No argument is required yet, preserved for
 *        the future.
 *
 * @return 0 for success, -1 for failure.
 */
XGB_DLL int XGTrackerWait(TrackerHandle handle, char const *config);

/**
 * @brief Free a tracker instance. XGTrackerWait() is called internally. If the tracker
 *        cannot close properly, manual interruption is required.
 *
 * @param handle The handle to the tracker.
 *
 * @return 0 for success, -1 for failure.
 */
XGB_DLL int XGTrackerFree(TrackerHandle handle);

/*!
 * \brief Initialize the collective communicator.
 *
@ -1536,6 +1613,8 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, const char *config,
 * - DMLC_TRACKER_PORT: Port number of the tracker.
 * - DMLC_TASK_ID: ID of the current task, can be used to obtain deterministic rank assignment.
 * - DMLC_WORKER_CONNECT_RETRY: Number of retries to connect to the tracker.
 * - dmlc_nccl_path: The path to NCCL shared object. Only used if XGBoost is compiled with
 *   `USE_DLOPEN_NCCL`.
 * Only applicable to the Federated communicator (use upper case for environment variables, use
 * lower case for runtime configuration):
 * - federated_server_address: Address of the federated server.

@ -412,19 +412,24 @@ class TCPSocket {
    return Success();
  }

  void SetKeepAlive() {
  [[nodiscard]] Result SetKeepAlive() {
    std::int32_t keepalive = 1;
    xgboost_CHECK_SYS_CALL(setsockopt(handle_, SOL_SOCKET, SO_KEEPALIVE,
                                      reinterpret_cast<char *>(&keepalive), sizeof(keepalive)),
                           0);
    auto rc = setsockopt(handle_, SOL_SOCKET, SO_KEEPALIVE, reinterpret_cast<char *>(&keepalive),
                         sizeof(keepalive));
    if (rc != 0) {
      return system::FailWithCode("Failed to set TCP keepalive.");
    }
    return Success();
  }

  void SetNoDelay() {
  [[nodiscard]] Result SetNoDelay() {
    std::int32_t tcp_no_delay = 1;
    xgboost_CHECK_SYS_CALL(
        setsockopt(handle_, IPPROTO_TCP, TCP_NODELAY, reinterpret_cast<char *>(&tcp_no_delay),
                   sizeof(tcp_no_delay)),
        0);
    auto rc = setsockopt(handle_, IPPROTO_TCP, TCP_NODELAY, reinterpret_cast<char *>(&tcp_no_delay),
                         sizeof(tcp_no_delay));
    if (rc != 0) {
      return system::FailWithCode("Failed to set TCP no delay.");
    }
    return Success();
  }

  /**

@ -250,9 +250,15 @@ struct Context : public XGBoostParameter<Context> {
      default:
        // Do not use the device name as this is likely an internal error, the name
        // wouldn't be valid.
        LOG(FATAL) << "Unknown device type:"
                   << static_cast<std::underlying_type_t<DeviceOrd::Type>>(this->Device().device);
        break;
        if (this->Device().IsSycl()) {
          LOG(WARNING) << "The requested feature doesn't have SYCL specific implementation yet. "
                       << "CPU implementation is used";
          return cpu_fn();
        } else {
          LOG(FATAL) << "Unknown device type:"
                     << static_cast<std::underlying_type_t<DeviceOrd::Type>>(this->Device().device);
          break;
        }
    }
    return std::invoke_result_t<CPUFn>();
  }
@ -262,7 +268,6 @@ struct Context : public XGBoostParameter<Context> {
   */
  template <typename CPUFn, typename CUDAFn, typename SYCLFn>
  decltype(auto) DispatchDevice(CPUFn&& cpu_fn, CUDAFn&& cuda_fn, SYCLFn&& sycl_fn) const {
    static_assert(std::is_same_v<std::invoke_result_t<CPUFn>, std::invoke_result_t<CUDAFn>>);
    static_assert(std::is_same_v<std::invoke_result_t<CPUFn>, std::invoke_result_t<SYCLFn>>);
    if (this->Device().IsSycl()) {
      return sycl_fn();

@ -178,7 +178,7 @@ class MetaInfo {
   * in vertical federated learning, since each worker loads its own list of columns,
   * we need to sum them.
   */
  void SynchronizeNumberOfColumns();
  void SynchronizeNumberOfColumns(Context const* ctx);

  /*! \brief Whether the data is split row-wise. */
  bool IsRowSplit() const {

@ -582,20 +582,20 @@ auto MakeTensorView(Context const *ctx, Container &data, S &&...shape) {  // NOL
  return TensorView<T, sizeof...(S)>{data, in_shape, ctx->Device()};
}

template <typename T, typename... S>
LINALG_HD auto MakeTensorView(DeviceOrd device, common::Span<T> data, S &&...shape) {
template <typename T, decltype(common::dynamic_extent) ext, typename... S>
LINALG_HD auto MakeTensorView(DeviceOrd device, common::Span<T, ext> data, S &&...shape) {
  std::size_t in_shape[sizeof...(S)];
  detail::IndexToArr(in_shape, std::forward<S>(shape)...);
  return TensorView<T, sizeof...(S)>{data, in_shape, device};
}

template <typename T, typename... S>
auto MakeTensorView(Context const *ctx, common::Span<T> data, S &&...shape) {
template <typename T, decltype(common::dynamic_extent) ext, typename... S>
auto MakeTensorView(Context const *ctx, common::Span<T, ext> data, S &&...shape) {
  return MakeTensorView(ctx->Device(), data, std::forward<S>(shape)...);
}

template <typename T, typename... S>
auto MakeTensorView(Context const *ctx, Order order, common::Span<T> data, S &&...shape) {
template <typename T, decltype(common::dynamic_extent) ext, typename... S>
auto MakeTensorView(Context const *ctx, Order order, common::Span<T, ext> data, S &&...shape) {
  std::size_t in_shape[sizeof...(S)];
  detail::IndexToArr(in_shape, std::forward<S>(shape)...);
  return TensorView<T, sizeof...(S)>{data, in_shape, ctx->Device(), order};

@ -92,8 +92,8 @@ class Predictor {
   * \param out_predt Prediction vector to be initialized.
   * \param model Tree model used for prediction.
   */
  void InitOutPredictions(const MetaInfo& info, HostDeviceVector<bst_float>* out_predt,
                          const gbm::GBTreeModel& model) const;
  virtual void InitOutPredictions(const MetaInfo& info, HostDeviceVector<bst_float>* out_predt,
                                  const gbm::GBTreeModel& model) const;

  /**
   * \brief Generate batch predictions for a given feature matrix. May use

@ -1,23 +1,24 @@
/**
 * Copyright 2021-2023 by XGBoost Contributors
 * Copyright 2021-2023, XGBoost Contributors
 */
#ifndef XGBOOST_STRING_VIEW_H_
#define XGBOOST_STRING_VIEW_H_
#include <xgboost/logging.h>  // CHECK_LT
#include <xgboost/span.h>     // Span

#include <algorithm>  // std::equal,std::min
#include <iterator>   // std::reverse_iterator
#include <ostream>    // std::ostream
#include <string>     // std::char_traits,std::string
#include <algorithm>  // for equal, min
#include <cstddef>    // for size_t
#include <iterator>   // for reverse_iterator
#include <ostream>    // for ostream
#include <string>     // for char_traits, string

namespace xgboost {
struct StringView {
 private:
  using CharT = char;  // unsigned char
  using CharT = char;
  using Traits = std::char_traits<CharT>;
  CharT const* str_{nullptr};
  size_t size_{0};
  std::size_t size_{0};

 public:
  using value_type = CharT;  // NOLINT
@ -28,40 +29,41 @@ struct StringView {

 public:
  constexpr StringView() = default;
  constexpr StringView(CharT const* str, std::size_t size) : str_{str}, size_{size} {}
  constexpr StringView(value_type const* str, std::size_t size) : str_{str}, size_{size} {}
  StringView(std::string const& str) : str_{str.c_str()}, size_{str.size()} {}  // NOLINT
  constexpr StringView(CharT const* str)  // NOLINT
  constexpr StringView(value_type const* str)  // NOLINT
      : str_{str}, size_{str == nullptr ? 0ul : Traits::length(str)} {}

  CharT const& operator[](size_t p) const { return str_[p]; }
  CharT const& at(size_t p) const {  // NOLINT
  [[nodiscard]] value_type const& operator[](std::size_t p) const { return str_[p]; }
  [[nodiscard]] explicit operator std::string() const { return {this->c_str(), this->size()}; }
  [[nodiscard]] value_type const& at(std::size_t p) const {  // NOLINT
    CHECK_LT(p, size_);
    return str_[p];
  }
  constexpr std::size_t size() const { return size_; }  // NOLINT
  constexpr bool empty() const { return size() == 0; }  // NOLINT
  StringView substr(size_t beg, size_t n) const {  // NOLINT
  [[nodiscard]] constexpr std::size_t size() const { return size_; }  // NOLINT
  [[nodiscard]] constexpr bool empty() const { return size() == 0; }  // NOLINT
  [[nodiscard]] StringView substr(std::size_t beg, std::size_t n) const {  // NOLINT
    CHECK_LE(beg, size_);
    size_t len = std::min(n, size_ - beg);
    std::size_t len = std::min(n, size_ - beg);
    return {str_ + beg, len};
  }
  CharT const* c_str() const { return str_; }  // NOLINT
  [[nodiscard]] value_type const* c_str() const { return str_; }  // NOLINT

  constexpr CharT const* cbegin() const { return str_; }  // NOLINT
  constexpr CharT const* cend() const { return str_ + size(); }  // NOLINT
  constexpr CharT const* begin() const { return str_; }  // NOLINT
  constexpr CharT const* end() const { return str_ + size(); }  // NOLINT
  [[nodiscard]] constexpr const_iterator cbegin() const { return str_; }  // NOLINT
  [[nodiscard]] constexpr const_iterator cend() const { return str_ + size(); }  // NOLINT
  [[nodiscard]] constexpr iterator begin() const { return str_; }  // NOLINT
  [[nodiscard]] constexpr iterator end() const { return str_ + size(); }  // NOLINT

  const_reverse_iterator rbegin() const noexcept {  // NOLINT
  [[nodiscard]] const_reverse_iterator rbegin() const noexcept {  // NOLINT
    return const_reverse_iterator(this->end());
  }
  const_reverse_iterator crbegin() const noexcept {  // NOLINT
  [[nodiscard]] const_reverse_iterator crbegin() const noexcept {  // NOLINT
    return const_reverse_iterator(this->end());
  }
  const_reverse_iterator rend() const noexcept {  // NOLINT
  [[nodiscard]] const_reverse_iterator rend() const noexcept {  // NOLINT
    return const_reverse_iterator(this->begin());
  }
  const_reverse_iterator crend() const noexcept {  // NOLINT
  [[nodiscard]] const_reverse_iterator crend() const noexcept {  // NOLINT
    return const_reverse_iterator(this->begin());
  }
};

@ -106,9 +106,11 @@ if __name__ == "__main__":
    if cli_args.use_cuda == 'ON':
        CONFIG['USE_CUDA'] = 'ON'
        CONFIG['USE_NCCL'] = 'ON'
        CONFIG["USE_DLOPEN_NCCL"] = "OFF"
    elif cli_args.use_hip == 'ON':
        CONFIG['USE_HIP'] = 'ON'
        CONFIG['USE_RCCL'] = 'ON'
        CONFIG["USE_DLOPEN_RCCL"] = "OFF"

    args = ["-D{0}:BOOL={1}".format(k, v) for k, v in CONFIG.items()]


@ -33,7 +33,7 @@
    <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <flink.version>1.17.1</flink.version>
    <flink.version>1.18.0</flink.version>
    <junit.version>4.13.2</junit.version>
    <spark.version>3.4.1</spark.version>
    <spark.version.gpu>3.4.1</spark.version.gpu>
@ -43,11 +43,11 @@
    <maven.wagon.http.retryHandler.count>5</maven.wagon.http.retryHandler.count>
    <log.capi.invocation>OFF</log.capi.invocation>
    <use.cuda>OFF</use.cuda>
    <cudf.version>23.10.0</cudf.version>
    <spark.rapids.version>23.10.0</spark.rapids.version>
    <use.hip>OFF</use.hip>
    <cudf.version>23.08.0</cudf.version>
    <spark.rapids.version>23.08.1</spark.rapids.version>
    <cudf.classifier>cuda11</cudf.classifier>
    <scalatest.version>3.2.16</scalatest.version>
    <scalatest.version>3.2.17</scalatest.version>
    <scala-collection-compat.version>2.11.0</scala-collection-compat.version>
  </properties>
  <repositories>
@ -382,7 +382,7 @@
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-checkstyle-plugin</artifactId>
        <version>3.3.0</version>
        <version>3.3.1</version>
        <configuration>
          <configLocation>checkstyle.xml</configLocation>
          <failOnViolation>true</failOnViolation>
@ -435,7 +435,7 @@
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-surefire-plugin</artifactId>
        <version>3.1.2</version>
        <version>3.2.2</version>
        <configuration>
          <skipTests>false</skipTests>
          <useSystemClassLoader>false</useSystemClassLoader>
@ -458,7 +458,7 @@
      <plugins>
        <plugin>
          <artifactId>maven-project-info-reports-plugin</artifactId>
          <version>3.4.5</version>
          <version>3.5.0</version>
        </plugin>
        <plugin>
          <groupId>net.alchim31.maven</groupId>
@ -482,7 +482,7 @@
    <dependency>
      <groupId>commons-logging</groupId>
      <artifactId>commons-logging</artifactId>
      <version>1.2</version>
      <version>1.3.0</version>
    </dependency>
    <dependency>
      <groupId>org.scalatest</groupId>

@ -63,7 +63,7 @@
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-lang3</artifactId>
      <version>3.13.0</version>
      <version>3.14.0</version>
    </dependency>
  </dependencies>

@ -72,7 +72,7 @@
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-javadoc-plugin</artifactId>
        <version>3.5.0</version>
        <version>3.6.2</version>
        <configuration>
          <show>protected</show>
          <nohelp>true</nohelp>

@ -206,7 +206,7 @@ class GpuXGBoostGeneralSuite extends GpuTestSuite {
        .setDevice("cuda:1")
        .fit(trainingDf)
    }
    assert(thrown.getMessage.contains("`cuda` or `gpu`"))
    assert(thrown.getMessage.contains("device given invalid value cuda:1"))
  }
}
}

@ -31,7 +31,8 @@ import org.apache.commons.logging.LogFactory
import org.apache.hadoop.fs.FileSystem

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkContext, TaskContext}
import org.apache.spark.resource.{ResourceProfileBuilder, TaskResourceRequests}
import org.apache.spark.{SparkConf, SparkContext, TaskContext}
import org.apache.spark.sql.SparkSession

/**
@ -72,7 +73,8 @@ private[scala] case class XGBoostExecutionParams(
    device: Option[String],
    isLocal: Boolean,
    featureNames: Option[Array[String]],
    featureTypes: Option[Array[String]]) {
    featureTypes: Option[Array[String]],
    runOnGpu: Boolean) {

  private var rawParamMap: Map[String, Any] = _

@ -186,14 +188,15 @@ private[this] class XGBoostExecutionParamsFactory(rawParams: Map[String, Any], s
      .asInstanceOf[Boolean]

    val treeMethod: Option[String] = overridedParams.get("tree_method").map(_.toString)
    // back-compatible with "gpu_hist"
    val device: Option[String] = if (treeMethod.exists(_ == "gpu_hist")) {
      Some("cuda")
    } else overridedParams.get("device").map(_.toString)
    val device: Option[String] = overridedParams.get("device").map(_.toString)
    val deviceIsGpu = device.exists(_ == "cuda")

    require(!(treeMethod.exists(_ == "approx") && device.exists(_ == "cuda")),
    require(!(treeMethod.exists(_ == "approx") && deviceIsGpu),
      "The tree method \"approx\" is not yet supported for Spark GPU cluster")

    // back-compatible with "gpu_hist"
    val runOnGpu = treeMethod.exists(_ == "gpu_hist") || deviceIsGpu

    val trackerConf = overridedParams.get("tracker_conf") match {
      case None => TrackerConf()
      case Some(conf: TrackerConf) => conf
@ -228,7 +231,8 @@ private[this] class XGBoostExecutionParamsFactory(rawParams: Map[String, Any], s
      device,
      isLocal,
      featureNames,
      featureTypes
      featureTypes,
      runOnGpu
    )
    xgbExecParam.setRawParamMap(overridedParams)
    xgbExecParam
@ -253,7 +257,132 @@ private[this] class XGBoostExecutionParamsFactory(rawParams: Map[String, Any], s
    )
  }

object XGBoost extends Serializable {
/**
 * A trait to manage stage-level scheduling
 */
private[spark] trait XGBoostStageLevel extends Serializable {
  private val logger = LogFactory.getLog("XGBoostSpark")

  private[spark] def isStandaloneOrLocalCluster(conf: SparkConf): Boolean = {
    val master = conf.get("spark.master")
    master != null && (master.startsWith("spark://") || master.startsWith("local-cluster"))
  }

  /**
   * To determine if stage-level scheduling should be skipped according to the Spark version
   * and Spark configurations.
   *
   * @param sparkVersion spark version
   * @param runOnGpu if xgboost training runs on GPUs
   * @param conf spark configurations
   * @return Boolean to skip stage-level scheduling or not
   */
  private[spark] def skipStageLevelScheduling(
      sparkVersion: String,
      runOnGpu: Boolean,
      conf: SparkConf): Boolean = {
    if (runOnGpu) {
      if (sparkVersion < "3.4.0") {
        logger.info("Stage-level scheduling in xgboost requires spark version 3.4.0+")
        return true
      }

      if (!isStandaloneOrLocalCluster(conf)) {
        logger.info("Stage-level scheduling in xgboost requires spark standalone or " +
          "local-cluster mode")
        return true
      }

      val executorCores = conf.getInt("spark.executor.cores", -1)
      val executorGpus = conf.getInt("spark.executor.resource.gpu.amount", -1)
      if (executorCores == -1 || executorGpus == -1) {
        logger.info("Stage-level scheduling in xgboost requires spark.executor.cores, " +
          "spark.executor.resource.gpu.amount to be set.")
        return true
      }

      if (executorCores == 1) {
        logger.info("Stage-level scheduling in xgboost requires spark.executor.cores > 1")
        return true
      }

      if (executorGpus > 1) {
        logger.info("Stage-level scheduling in xgboost will not work " +
          "when spark.executor.resource.gpu.amount > 1")
        return true
      }

      val taskGpuAmount = conf.getDouble("spark.task.resource.gpu.amount", -1.0).toFloat

      if (taskGpuAmount == -1.0) {
        // The ETL tasks will not grab a gpu when spark.task.resource.gpu.amount is not set,
        // but with stage-level scheduling, we can make the training task grab the gpu.
        return false
      }

      if (taskGpuAmount == executorGpus.toFloat) {
        // spark.executor.resource.gpu.amount = spark.task.resource.gpu.amount
        // results in only 1 task running at a time, which may cause perf issue.
        return true
      }
      // We can enable stage-level scheduling
      false
    } else true // Skip stage-level scheduling for cpu training.
  }

  /**
   * Attempt to modify the task resources so that only one task can be executed
   * on a single executor simultaneously.
   *
   * @param sc the spark context
   * @param rdd which rdd to be applied with new resource profile
   * @return the original rdd or the changed rdd
   */
  private[spark] def tryStageLevelScheduling(
      sc: SparkContext,
      xgbExecParams: XGBoostExecutionParams,
      rdd: RDD[(Booster, Map[String, Array[Float]])]
  ): RDD[(Booster, Map[String, Array[Float]])] = {

    val conf = sc.getConf
    if (skipStageLevelScheduling(sc.version, xgbExecParams.runOnGpu, conf)) {
      return rdd
    }

    // Ensure executor_cores is set
    val executor_cores = conf.getInt("spark.executor.cores", -1)
    if (executor_cores == -1) {
      throw new RuntimeException("Wrong spark.executor.cores")
    }

    // Spark-rapids is a GPU-acceleration project for Spark SQL.
    // When spark-rapids is enabled, we prevent concurrent execution of other ETL tasks
    // that utilize GPUs alongside training tasks in order to avoid GPU out-of-memory errors.
    val spark_plugins = conf.get("spark.plugins", " ")
    val spark_rapids_sql_enabled = conf.get("spark.rapids.sql.enabled", "true")

    // Determine the number of cores required for each task.
    val task_cores = if (spark_plugins.contains("com.nvidia.spark.SQLPlugin") &&
      spark_rapids_sql_enabled.toLowerCase == "true") {
      executor_cores
    } else {
      (executor_cores / 2) + 1
    }

    // Each training task requires cpu cores > total executor cores / 2 + 1 to
    // ensure tasks are sent to different executors.
    // Note: We cannot use GPUs to limit concurrent tasks
    // due to https://issues.apache.org/jira/browse/SPARK-45527.
    val task_gpus = 1.0
    val treqs = new TaskResourceRequests().cpus(task_cores).resource("gpu", task_gpus)
    val rp = new ResourceProfileBuilder().require(treqs).build()

    logger.info(s"XGBoost training tasks require the resource(cores=$task_cores, gpu=$task_gpus).")
    rdd.withResources(rp)
  }
}

object XGBoost extends XGBoostStageLevel {
  private val logger = LogFactory.getLog("XGBoostSpark")

  def getGPUAddrFromResources: Int = {
@ -315,7 +444,7 @@ object XGBoost extends Serializable {
    val externalCheckpointParams = xgbExecutionParam.checkpointParam

    var params = xgbExecutionParam.toMap
    if (xgbExecutionParam.device.exists(m => (m == "cuda" || m == "gpu"))) {
    if (xgbExecutionParam.runOnGpu) {
      val gpuId = if (xgbExecutionParam.isLocal) {
        // For local mode, force gpu id to primary device
        0
@ -413,10 +542,12 @@ object XGBoost extends Serializable {

      }}

      val boostersAndMetricsWithRes = tryStageLevelScheduling(sc, xgbExecParams,
        boostersAndMetrics)
      // The repartition step is to make the training stage a ShuffleMapStage, so that when one
      // of the training tasks fails, the training stage can retry. ResultStage won't retry when
      // it fails.
      val (booster, metrics) = boostersAndMetrics.repartition(1).collect()(0)
      val (booster, metrics) = boostersAndMetricsWithRes.repartition(1).collect()(0)
      val trackerReturnVal = tracker.waitFor(0L)
      logger.info(s"Rabit returns with exit code $trackerReturnVal")
      if (trackerReturnVal != 0) {

@ -154,11 +154,13 @@ private[spark] trait BoosterParams extends Params {
    (value: String) => BoosterParams.supportedTreeMethods.contains(value))

  final def getTreeMethod: String = $(treeMethod)

  /**
   * The device for running XGBoost algorithms, options: cpu, cuda
   */
  final val device = new Param[String](
    this, "device", "The device for running XGBoost algorithms, options: cpu, cuda"
    this, "device", "The device for running XGBoost algorithms, options: cpu, cuda",
    (value: String) => BoosterParams.supportedDevices.contains(value)
  )

  final def getDevice: String = $(device)
@ -288,4 +290,6 @@ private[scala] object BoosterParams {
  val supportedSampleType = HashSet("uniform", "weighted")

  val supportedNormalizeType = HashSet("tree", "forest")

  val supportedDevices = HashSet("cpu", "cuda")
}

@ -0,0 +1,150 @@
/*
 Copyright (c) 2023 by Contributors

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */

package ml.dmlc.xgboost4j.scala.spark

import ml.dmlc.xgboost4j.scala.Booster
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.scalatest.funsuite.AnyFunSuite

class XGBoostSuite extends AnyFunSuite with PerTest {

  // Do not create spark context
  override def beforeEach(): Unit = {}

  test("XGBoost execution parameters") {
    var xgbExecutionParams = new XGBoostExecutionParamsFactory(
      Map("device" -> "cpu", "num_workers" -> 1, "num_round" -> 1), sc)
      .buildXGBRuntimeParams
    assert(!xgbExecutionParams.runOnGpu)

    xgbExecutionParams = new XGBoostExecutionParamsFactory(
      Map("device" -> "cuda", "num_workers" -> 1, "num_round" -> 1), sc)
      .buildXGBRuntimeParams
    assert(xgbExecutionParams.runOnGpu)

    xgbExecutionParams = new XGBoostExecutionParamsFactory(
      Map("device" -> "cpu", "tree_method" -> "gpu_hist", "num_workers" -> 1, "num_round" -> 1), sc)
      .buildXGBRuntimeParams
    assert(xgbExecutionParams.runOnGpu)

    xgbExecutionParams = new XGBoostExecutionParamsFactory(
      Map("device" -> "cuda", "tree_method" -> "gpu_hist",
        "num_workers" -> 1, "num_round" -> 1), sc)
      .buildXGBRuntimeParams
    assert(xgbExecutionParams.runOnGpu)
  }

  test("skip stage-level scheduling") {
    val conf = new SparkConf()
      .setMaster("spark://foo")
      .set("spark.executor.cores", "12")
      .set("spark.task.cpus", "1")
      .set("spark.executor.resource.gpu.amount", "1")
      .set("spark.task.resource.gpu.amount", "0.08")

    // the correct configurations should not skip stage-level scheduling
    assert(!XGBoost.skipStageLevelScheduling(sparkVersion = "3.4.0", runOnGpu = true, conf))

    // spark version < 3.4.0
    assert(XGBoost.skipStageLevelScheduling(sparkVersion = "3.3.0", runOnGpu = true, conf))

    // not run on GPU
    assert(XGBoost.skipStageLevelScheduling(sparkVersion = "3.4.0", runOnGpu = false, conf))

    // spark.executor.cores is not set
    var badConf = conf.clone().remove("spark.executor.cores")
    assert(XGBoost.skipStageLevelScheduling(sparkVersion = "3.4.0", runOnGpu = true, badConf))

    // spark.executor.cores=1
    badConf = conf.clone().set("spark.executor.cores", "1")
    assert(XGBoost.skipStageLevelScheduling(sparkVersion = "3.4.0", runOnGpu = true, badConf))

    // spark.executor.resource.gpu.amount is not set
    badConf = conf.clone().remove("spark.executor.resource.gpu.amount")
    assert(XGBoost.skipStageLevelScheduling(sparkVersion = "3.4.0", runOnGpu = true, badConf))

    // spark.executor.resource.gpu.amount>1
    badConf = conf.clone().set("spark.executor.resource.gpu.amount", "2")
    assert(XGBoost.skipStageLevelScheduling(sparkVersion = "3.4.0", runOnGpu = true, badConf))

    // spark.task.resource.gpu.amount is not set
    badConf = conf.clone().remove("spark.task.resource.gpu.amount")
    assert(!XGBoost.skipStageLevelScheduling(sparkVersion = "3.4.0", runOnGpu = true, badConf))

    // spark.task.resource.gpu.amount=1
    badConf = conf.clone().set("spark.task.resource.gpu.amount", "1")
    assert(XGBoost.skipStageLevelScheduling(sparkVersion = "3.4.0", runOnGpu = true, badConf))

    // yarn
    badConf = conf.clone().setMaster("yarn")
    assert(XGBoost.skipStageLevelScheduling(sparkVersion = "3.4.0", runOnGpu = true, badConf))

    // k8s
    badConf = conf.clone().setMaster("k8s://")
    assert(XGBoost.skipStageLevelScheduling(sparkVersion = "3.4.0", runOnGpu = true, badConf))
  }


  object FakedXGBoost extends XGBoostStageLevel {

    // Do not skip stage-level scheduling for testing purposes.
    override private[spark] def skipStageLevelScheduling(
        sparkVersion: String,
        runOnGpu: Boolean,
        conf: SparkConf) = false
  }

  test("try stage-level scheduling without spark-rapids") {

    val builder = SparkSession.builder()
      .master(s"local-cluster[1, 4, 1024]")
      .appName("XGBoostSuite")
      .config("spark.ui.enabled", false)
      .config("spark.driver.memory", "512m")
      .config("spark.barrier.sync.timeout", 10)
      .config("spark.task.cpus", 1)
      .config("spark.executor.cores", 4)
      .config("spark.executor.resource.gpu.amount", 1)
      .config("spark.task.resource.gpu.amount", 0.25)

    val ss = builder.getOrCreate()

    try {
      val df = ss.range(1, 10)
      val rdd = df.rdd

      val xgbExecutionParams = new XGBoostExecutionParamsFactory(
        Map("device" -> "cuda", "num_workers" -> 1, "num_round" -> 1), sc)
        .buildXGBRuntimeParams
      assert(xgbExecutionParams.runOnGpu)

      val finalRDD = FakedXGBoost.tryStageLevelScheduling(ss.sparkContext, xgbExecutionParams,
        rdd.asInstanceOf[RDD[(Booster, Map[String, Array[Float]])]])

      val taskResources = finalRDD.getResourceProfile().taskResources
      assert(taskResources.contains("cpus"))
      assert(taskResources.get("cpus").get.amount == 3)

      assert(taskResources.contains("gpu"))
      assert(taskResources.get("gpu").get.amount == 1.0)
    } finally {
      ss.stop()
    }
  }
}
@ -60,7 +60,7 @@
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-javadoc-plugin</artifactId>
        <version>3.5.0</version>
        <version>3.6.2</version>
        <configuration>
          <show>protected</show>
          <nohelp>true</nohelp>

@ -1,31 +1,29 @@
if(PLUGIN_DENSE_PARSER)
  target_sources(objxgboost PRIVATE ${xgboost_SOURCE_DIR}/plugin/dense_parser/dense_libsvm.cc)
endif()

if(PLUGIN_UPDATER_ONEAPI)
  add_library(oneapi_plugin OBJECT
    ${xgboost_SOURCE_DIR}/plugin/updater_oneapi/regression_obj_oneapi.cc
    ${xgboost_SOURCE_DIR}/plugin/updater_oneapi/predictor_oneapi.cc)
  target_include_directories(oneapi_plugin
if(PLUGIN_SYCL)
  set(CMAKE_CXX_COMPILER "icpx")
  add_library(plugin_sycl OBJECT
    ${xgboost_SOURCE_DIR}/plugin/sycl/device_manager.cc
    ${xgboost_SOURCE_DIR}/plugin/sycl/predictor/predictor.cc)
  target_include_directories(plugin_sycl
    PRIVATE
    ${xgboost_SOURCE_DIR}/include
    ${xgboost_SOURCE_DIR}/dmlc-core/include
    ${xgboost_SOURCE_DIR}/rabit/include)
  target_compile_definitions(oneapi_plugin PUBLIC -DXGBOOST_USE_ONEAPI=1)
  target_link_libraries(oneapi_plugin PUBLIC -fsycl)
  set_target_properties(oneapi_plugin PROPERTIES
  target_compile_definitions(plugin_sycl PUBLIC -DXGBOOST_USE_SYCL=1)
  target_link_libraries(plugin_sycl PUBLIC -fsycl)
  set_target_properties(plugin_sycl PROPERTIES
    COMPILE_FLAGS -fsycl
    CXX_STANDARD 17
    CXX_STANDARD_REQUIRED ON
    POSITION_INDEPENDENT_CODE ON)
  if(USE_OPENMP)
    find_package(OpenMP REQUIRED)
    target_link_libraries(oneapi_plugin PUBLIC OpenMP::OpenMP_CXX)
    set_target_properties(plugin_sycl PROPERTIES
      COMPILE_FLAGS "-fsycl -qopenmp")
  endif()
  # Get compilation and link flags of oneapi_plugin and propagate to objxgboost
  target_link_libraries(objxgboost PUBLIC oneapi_plugin)
  # Add all objects of oneapi_plugin to objxgboost
  target_sources(objxgboost INTERFACE $<TARGET_OBJECTS:oneapi_plugin>)
  # Get compilation and link flags of plugin_sycl and propagate to objxgboost
  target_link_libraries(objxgboost PUBLIC plugin_sycl)
  # Add all objects of plugin_sycl to objxgboost
  target_sources(objxgboost INTERFACE $<TARGET_OBJECTS:plugin_sycl>)
endif()

# Add the Federated Learning plugin if enabled.

@ -36,5 +36,3 @@ The register macros available to plugin writers are:
And from dmlc-core:

- DMLC_REGISTER_PARAMETER - Register a set of parameters for a specific use case.
- DMLC_REGISTER_DATA_PARSER - Register a data parser where the data can be
  represented by a URL. This is used by DMatrix.
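For illustration, a minimal sketch of how a plugin might use the dmlc-core parameter macro (the struct and field names here are hypothetical, not part of this diff):

```cpp
#include <dmlc/parameter.h>

// Hypothetical plugin parameter set, declared and registered with dmlc-core.
struct MyPluginParam : public dmlc::Parameter<MyPluginParam> {
  float alpha;
  DMLC_DECLARE_PARAMETER(MyPluginParam) {
    DMLC_DECLARE_FIELD(alpha).set_default(1.0f).describe("Example tuning knob.");
  }
};
// Makes the parameter set available through dmlc's registry.
DMLC_REGISTER_PARAMETER(MyPluginParam);
```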
@ -1,87 +0,0 @@
/*!
 * Copyright 2015 by Contributors
 * \file dense_libsvm.cc
 * \brief Plugin to load in libsvm, but fill all the missing entries with zeros.
 * This plugin is mainly used for benchmark purposes and does not need to be included.
 */
#include <xgboost/base.h>
#include <dmlc/data.h>
#include <memory>

namespace dmlc {
namespace data {

template<typename IndexType>
class DensifyParser : public dmlc::Parser<IndexType> {
 public:
  DensifyParser(dmlc::Parser<IndexType>* parser, uint32_t num_col)
      : parser_(parser), num_col_(num_col) {
  }

  void BeforeFirst() override {
    parser_->BeforeFirst();
  }

  bool Next() override {
    if (!parser_->Next()) return false;
    const RowBlock<IndexType>& batch = parser_->Value();
    LOG(INFO) << batch.size;
    dense_index_.resize(num_col_ * batch.size);
    dense_value_.resize(num_col_ * batch.size);
    std::fill(dense_value_.begin(), dense_value_.end(), 0.0);
    offset_.resize(batch.size + 1);
    offset_[0] = 0;

    for (size_t i = 0; i < batch.size; ++i) {
      offset_[i + 1] = (i + 1) * num_col_;
      Row<IndexType> row = batch[i];
      for (uint32_t j = 0; j < num_col_; ++j) {
        dense_index_[i * num_col_ + j] = j;
      }
      for (unsigned k = 0; k < row.length; ++k) {
        uint32_t index = row.get_index(k);
        CHECK_LT(index, num_col_)
            << "Feature index larger than num_col";
        dense_value_[i * num_col_ + index] = row.get_value(k);
      }
    }
    out_ = batch;
    out_.index = dmlc::BeginPtr(dense_index_);
    out_.value = dmlc::BeginPtr(dense_value_);
    out_.offset = dmlc::BeginPtr(offset_);
    return true;
  }

  const dmlc::RowBlock<IndexType>& Value() const override {
    return out_;
  }

  size_t BytesRead() const override {
    return parser_->BytesRead();
  }

 private:
  RowBlock<IndexType> out_;
  std::unique_ptr<Parser<IndexType> > parser_;
  uint32_t num_col_;
  std::vector<size_t> offset_;
  std::vector<IndexType> dense_index_;
  std::vector<xgboost::bst_float> dense_value_;
};

template<typename IndexType, typename DType = real_t>
Parser<IndexType> *
CreateDenseLibSVMParser(const std::string& path,
                        const std::map<std::string, std::string>& args,
                        unsigned part_index,
                        unsigned num_parts) {
  CHECK_NE(args.count("num_col"), 0) << "expect num_col in dense_libsvm";
  return new DensifyParser<IndexType>(
      Parser<IndexType>::Create(path.c_str(), part_index, num_parts, "libsvm"),
      uint32_t(atoi(args.at("num_col").c_str())));
}
}  // namespace data

DMLC_REGISTER_DATA_PARSER(uint32_t, real_t, dense_libsvm,
                          data::CreateDenseLibSVMParser<uint32_t __DMLC_COMMA real_t>);
}  // namespace dmlc
@ -29,7 +29,7 @@ namespace {
  auto stub = fed->Handle();

  BroadcastRequest request;
  request.set_sequence_number(*sequence_number++);
  request.set_sequence_number((*sequence_number)++);
  request.set_rank(comm.Rank());
  if (comm.Rank() != root) {
    request.set_send_buffer(nullptr, 0);
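The parenthesized form is the substance of this hunk: postfix `++` binds tighter than unary `*`, so the old code advanced the pointer instead of incrementing the counter it points to. A minimal stand-alone illustration (hypothetical variable names, not from the diff):

```cpp
#include <cassert>

int main() {
  int counter = 0;
  int* p = &counter;
  // (*p)++ increments the pointee; *p++ would read *p and then move the pointer.
  (*p)++;
  assert(counter == 1);
  return 0;
}
```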
@ -90,9 +90,9 @@ Coll *FederatedColl::MakeCUDAVar() {
[[nodiscard]] Result FederatedColl::Broadcast(Comm const &comm, common::Span<std::int8_t> data,
                                              std::int32_t root) {
  if (comm.Rank() == root) {
    return BroadcastImpl(comm, &sequence_number_, data, root);
    return BroadcastImpl(comm, &this->sequence_number_, data, root);
  } else {
    return BroadcastImpl(comm, &sequence_number_, data, root);
    return BroadcastImpl(comm, &this->sequence_number_, data, root);
  }
}

@ -60,7 +60,8 @@ void FederatedComm::Init(std::string const& host, std::int32_t port, std::int32_
  }
}

FederatedComm::FederatedComm(Json const& config) {
FederatedComm::FederatedComm(std::int32_t retry, std::chrono::seconds timeout, std::string task_id,
                             Json const& config) {
  /**
   * Topology
   */
@ -93,6 +94,13 @@ FederatedComm::FederatedComm(Json const& config) {
  CHECK_NE(world_size, 0) << "Parameter `federated_world_size` is required.";
  CHECK(!server_address.empty()) << "Parameter `federated_server_address` is required.";

  /**
   * Basic config
   */
  this->retry_ = retry;
  this->timeout_ = timeout;
  this->task_id_ = task_id;

  /**
   * Certificates
   */

@ -11,6 +11,8 @@ namespace xgboost::collective {
CUDAFederatedComm::CUDAFederatedComm(Context const* ctx, std::shared_ptr<FederatedComm const> impl)
    : FederatedComm{impl}, stream_{ctx->CUDACtx()->Stream()} {
  CHECK(impl);
  CHECK(ctx->IsCUDA());
  dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
}

Comm* FederatedComm::MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll>) const {

@ -5,9 +5,11 @@

#include <memory>  // for shared_ptr

#include "../../src/collective/coll.h"           // for Coll
#include "../../src/common/device_helpers.cuh"   // for CUDAStreamView
#include "federated_comm.h"                      // for FederatedComm
#include "xgboost/context.h"                     // for Context
#include "xgboost/logging.h"

namespace xgboost::collective {
class CUDAFederatedComm : public FederatedComm {
@ -16,5 +18,9 @@ class CUDAFederatedComm : public FederatedComm {
 public:
  explicit CUDAFederatedComm(Context const* ctx, std::shared_ptr<FederatedComm const> impl);
  [[nodiscard]] auto Stream() const { return stream_; }
  Comm* MakeCUDAVar(Context const*, std::shared_ptr<Coll>) const override {
    LOG(FATAL) << "[Internal Error]: Invalid request for CUDA variant.";
    return nullptr;
  }
};
}  // namespace xgboost::collective

@ -10,12 +10,12 @@
#include <memory>  // for unique_ptr
#include <string>  // for string

#include "../../src/collective/comm.h"    // for Comm
#include "../../src/collective/comm.h"    // for HostComm
#include "../../src/common/json_utils.h"  // for OptionalArg
#include "xgboost/json.h"

namespace xgboost::collective {
class FederatedComm : public Comm {
class FederatedComm : public HostComm {
  std::shared_ptr<federated::Federated::Stub> stub_;

  void Init(std::string const& host, std::int32_t port, std::int32_t world, std::int32_t rank,
@ -27,6 +27,10 @@ class FederatedComm : public Comm {
    this->rank_ = that->Rank();
    this->world_ = that->World();

    this->retry_ = that->Retry();
    this->timeout_ = that->Timeout();
    this->task_id_ = that->TaskID();

    this->tracker_ = that->TrackerInfo();
  }

@ -41,7 +45,8 @@ class FederatedComm : public Comm {
   * - federated_client_key_path
   * - federated_client_cert_path
   */
  explicit FederatedComm(Json const& config);
  explicit FederatedComm(std::int32_t retry, std::chrono::seconds timeout, std::string task_id,
                         Json const& config);
  explicit FederatedComm(std::string const& host, std::int32_t port, std::int32_t world,
                         std::int32_t rank) {
    this->Init(host, port, world, rank, {}, {}, {});
@ -59,6 +64,6 @@ class FederatedComm : public Comm {
  [[nodiscard]] bool IsFederated() const override { return true; }
  [[nodiscard]] federated::Federated::Stub* Handle() const { return stub_.get(); }

  Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const override;
  [[nodiscard]] Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const override;
};
}  // namespace xgboost::collective

40
plugin/sycl/README.md
Executable file
@ -0,0 +1,40 @@
<!--
******************************************************************************
* Copyright by Contributors 2017-2023
*******************************************************************************/-->

# SYCL-based Algorithm for Tree Construction
This plugin adds support for the SYCL programming model to XGBoost's prediction algorithms.

## Usage
Specify the 'device' parameter as described in the table below to offload model training and inference to a SYCL device.

### Algorithms
| device | Description |
| --- | --- |
| sycl | use default sycl device |
| sycl:gpu | use default sycl gpu |
| sycl:cpu | use default sycl cpu |
| sycl:gpu:N | use sycl gpu number N |
| sycl:cpu:N | use sycl cpu number N |

Python example:
```python
param['device'] = 'sycl:gpu:0'
```
Note: 'sycl:cpu' devices are fully functional but cannot provide good performance. We recommend using 'sycl:cpu' devices only for testing purposes.
Note: if the device is specified as 'sycl', the device type is chosen automatically. If the system has both a sycl GPU and a sycl CPU, the GPU is used.
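As a slightly fuller sketch (synthetic data and hypothetical parameter values; the only part taken from this README is the 'device' string):

```python
import numpy as np
import xgboost as xgb

X = np.random.rand(100, 4)
y = np.random.rand(100)
dtrain = xgb.DMatrix(X, label=y)

# Offload work to the default SYCL GPU, if one is available.
params = {"device": "sycl:gpu", "objective": "reg:squarederror"}
booster = xgb.train(params, dtrain, num_boost_round=10)
preds = booster.predict(dtrain)
```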

## Dependencies
To build and use the plugin, install [Intel® oneAPI DPC++/C++ Compiler](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compiler.html).
See also [Intel® oneAPI Programming Guide](https://www.intel.com/content/www/us/en/docs/oneapi/programming-guide/2024-0/overview.html).

## Build
From the ``xgboost`` directory, run:

```bash
$ mkdir build
$ cd build
$ cmake .. -DPLUGIN_SYCL=ON
$ make -j
```
256
plugin/sycl/data.h
Normal file
@ -0,0 +1,256 @@
/*!
 * Copyright by Contributors 2017-2023
 */
#ifndef PLUGIN_SYCL_DATA_H_
#define PLUGIN_SYCL_DATA_H_

#include <cstddef>
#include <limits>
#include <mutex>
#include <vector>
#include <memory>
#include <algorithm>

#include "xgboost/base.h"
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
#pragma GCC diagnostic ignored "-W#pragma-messages"
#include "xgboost/data.h"
#pragma GCC diagnostic pop
#include "xgboost/logging.h"
#include "xgboost/host_device_vector.h"

#include "../../src/common/threading_utils.h"

#include "CL/sycl.hpp"

namespace xgboost {
namespace sycl {
enum class MemoryType { shared, on_device};

template <typename T>
class USMDeleter {
 public:
  explicit USMDeleter(::sycl::queue qu) : qu_(qu) {}

  void operator()(T* data) const {
    ::sycl::free(data, qu_);
  }

 private:
  ::sycl::queue qu_;
};

template <typename T, MemoryType memory_type = MemoryType::shared>
class USMVector {
  static_assert(std::is_standard_layout<T>::value, "USMVector admits only POD types");

  std::shared_ptr<T> allocate_memory_(::sycl::queue* qu, size_t size) {
    if constexpr (memory_type == MemoryType::shared) {
      return std::shared_ptr<T>(::sycl::malloc_shared<T>(size_, *qu), USMDeleter<T>(*qu));
    } else {
      return std::shared_ptr<T>(::sycl::malloc_device<T>(size_, *qu), USMDeleter<T>(*qu));
    }
  }

  void copy_vector_to_memory_(::sycl::queue* qu, const std::vector<T> &vec) {
    if constexpr (memory_type == MemoryType::shared) {
      std::copy(vec.begin(), vec.end(), data_.get());
    } else {
      qu->memcpy(data_.get(), vec.data(), size_ * sizeof(T));
    }
  }

 public:
  USMVector() : size_(0), capacity_(0), data_(nullptr) {}

  USMVector(::sycl::queue* qu, size_t size) : size_(size), capacity_(size) {
    data_ = allocate_memory_(qu, size_);
  }

  USMVector(::sycl::queue* qu, size_t size, T v) : size_(size), capacity_(size) {
    data_ = allocate_memory_(qu, size_);
    qu->fill(data_.get(), v, size_).wait();
  }

  USMVector(::sycl::queue* qu, const std::vector<T> &vec) {
    size_ = vec.size();
    capacity_ = size_;
    data_ = allocate_memory_(qu, size_);
    copy_vector_to_memory_(qu, vec);
  }

  ~USMVector() {
  }

  USMVector& operator=(const USMVector& other) {
    size_ = other.size_;
    capacity_ = other.capacity_;
    data_ = other.data_;
    return *this;
  }

  T* Data() { return data_.get(); }
  const T* DataConst() const { return data_.get(); }

  size_t Size() const { return size_; }

  size_t Capacity() const { return capacity_; }

  T& operator[] (size_t i) { return data_.get()[i]; }
  const T& operator[] (size_t i) const { return data_.get()[i]; }

  T* Begin () const { return data_.get(); }
  T* End () const { return data_.get() + size_; }

  bool Empty() const { return (size_ == 0); }

  void Clear() {
    data_.reset();
    size_ = 0;
    capacity_ = 0;
  }

  void Resize(::sycl::queue* qu, size_t size_new) {
    if (size_new <= capacity_) {
      size_ = size_new;
    } else {
      size_t size_old = size_;
      auto data_old = data_;
      size_ = size_new;
      capacity_ = size_new;
      data_ = allocate_memory_(qu, size_);
      if (size_old > 0) {
        qu->memcpy(data_.get(), data_old.get(), sizeof(T) * size_old).wait();
      }
    }
  }

  void Resize(::sycl::queue* qu, size_t size_new, T v) {
    if (size_new <= size_) {
      size_ = size_new;
    } else if (size_new <= capacity_) {
      qu->fill(data_.get() + size_, v, size_new - size_).wait();
      size_ = size_new;
    } else {
      size_t size_old = size_;
      auto data_old = data_;
      size_ = size_new;
      capacity_ = size_new;
      data_ = allocate_memory_(qu, size_);
      if (size_old > 0) {
        qu->memcpy(data_.get(), data_old.get(), sizeof(T) * size_old).wait();
      }
      qu->fill(data_.get() + size_old, v, size_new - size_old).wait();
    }
  }

  ::sycl::event ResizeAsync(::sycl::queue* qu, size_t size_new, T v) {
    if (size_new <= size_) {
      size_ = size_new;
      return ::sycl::event();
    } else if (size_new <= capacity_) {
      auto event = qu->fill(data_.get() + size_, v, size_new - size_);
      size_ = size_new;
      return event;
    } else {
      size_t size_old = size_;
      auto data_old = data_;
      size_ = size_new;
      capacity_ = size_new;
      data_ = allocate_memory_(qu, size_);
      ::sycl::event event;
      if (size_old > 0) {
        event = qu->memcpy(data_.get(), data_old.get(), sizeof(T) * size_old);
      }
      return qu->fill(data_.get() + size_old, v, size_new - size_old, event);
    }
  }

  ::sycl::event ResizeAndFill(::sycl::queue* qu, size_t size_new, int v) {
    if (size_new <= size_) {
      size_ = size_new;
      return qu->memset(data_.get(), v, size_new * sizeof(T));
    } else if (size_new <= capacity_) {
      size_ = size_new;
      return qu->memset(data_.get(), v, size_new * sizeof(T));
    } else {
      size_t size_old = size_;
      auto data_old = data_;
      size_ = size_new;
      capacity_ = size_new;
      data_ = allocate_memory_(qu, size_);
      return qu->memset(data_.get(), v, size_new * sizeof(T));
    }
  }

  ::sycl::event Fill(::sycl::queue* qu, T v) {
    return qu->fill(data_.get(), v, size_);
  }

  void Init(::sycl::queue* qu, const std::vector<T> &vec) {
    size_ = vec.size();
    capacity_ = size_;
    data_ = allocate_memory_(qu, size_);
    copy_vector_to_memory_(qu, vec);
  }

  using value_type = T;  // NOLINT

 private:
  size_t size_;
  size_t capacity_;
  std::shared_ptr<T> data_;
};

/* Wrapper for DMatrix which stores all batches in a single USM buffer */
struct DeviceMatrix {
  DMatrix* p_mat;  // Pointer to the original matrix on the host
  ::sycl::queue qu_;
  USMVector<size_t> row_ptr;
  USMVector<Entry> data;
  size_t total_offset;

  DeviceMatrix(::sycl::queue qu, DMatrix* dmat) : p_mat(dmat), qu_(qu) {
    size_t num_row = 0;
    size_t num_nonzero = 0;
    for (auto &batch : dmat->GetBatches<SparsePage>()) {
      const auto& data_vec = batch.data.HostVector();
      const auto& offset_vec = batch.offset.HostVector();
      num_nonzero += data_vec.size();
      num_row += batch.Size();
    }

    row_ptr.Resize(&qu_, num_row + 1);
    data.Resize(&qu_, num_nonzero);

    size_t data_offset = 0;
    for (auto &batch : dmat->GetBatches<SparsePage>()) {
      const auto& data_vec = batch.data.HostVector();
      const auto& offset_vec = batch.offset.HostVector();
      size_t batch_size = batch.Size();
      if (batch_size > 0) {
        std::copy(offset_vec.data(), offset_vec.data() + batch_size,
                  row_ptr.Data() + batch.base_rowid);
        if (batch.base_rowid > 0) {
          for (size_t i = 0; i < batch_size; i++)
            row_ptr[i + batch.base_rowid] += batch.base_rowid;
        }
        std::copy(data_vec.data(), data_vec.data() + offset_vec[batch_size],
                  data.Data() + data_offset);
        data_offset += offset_vec[batch_size];
      }
    }
    row_ptr[num_row] = data_offset;
    total_offset = data_offset;
  }

  ~DeviceMatrix() {
  }
};
}  // namespace sycl
}  // namespace xgboost

#endif  // PLUGIN_SYCL_DATA_H_
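As a quick orientation to the container above, a minimal host-side usage sketch (hypothetical, not part of the diff; it assumes a default-constructed SYCL queue and the pointer-taking constructors shown in the header):

```cpp
#include <CL/sycl.hpp>
#include "plugin/sycl/data.h"

int main() {
  ::sycl::queue qu;  // default device
  // Shared-memory vector of 16 floats, initialized to zero.
  xgboost::sycl::USMVector<float> vec(&qu, 16, 0.0f);
  vec.Fill(&qu, 1.0f).wait();   // asynchronous fill; wait for completion
  vec.Resize(&qu, 32, 0.5f);    // grow; the new tail is filled with 0.5f
  return vec.Size() == 32 ? 0 : 1;
}
```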
124
plugin/sycl/device_manager.cc
Normal file
@ -0,0 +1,124 @@
/*!
 * Copyright 2017-2023 by Contributors
 * \file device_manager.cc
 */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
#pragma GCC diagnostic ignored "-W#pragma-messages"
#include <rabit/rabit.h>
#pragma GCC diagnostic pop

#include "../sycl/device_manager.h"

namespace xgboost {
namespace sycl {

::sycl::device DeviceManager::GetDevice(const DeviceOrd& device_spec) const {
  if (!device_spec.IsSycl()) {
    LOG(WARNING) << "Sycl kernel is executed with non-sycl context: "
                 << device_spec.Name() << ". "
                 << "Default sycl device_selector will be used.";
  }

  bool not_use_default_selector = (device_spec.ordinal != kDefaultOrdinal) ||
                                  (rabit::IsDistributed());
  if (not_use_default_selector) {
    DeviceRegister& device_register = GetDevicesRegister();
    const int device_idx = rabit::IsDistributed() ? rabit::GetRank() : device_spec.ordinal;
    if (device_spec.IsSyclDefault()) {
      auto& devices = device_register.devices;
      CHECK_LT(device_idx, devices.size());
      return devices[device_idx];
    } else if (device_spec.IsSyclCPU()) {
      auto& cpu_devices = device_register.cpu_devices;
      CHECK_LT(device_idx, cpu_devices.size());
      return cpu_devices[device_idx];
    } else {
      auto& gpu_devices = device_register.gpu_devices;
      CHECK_LT(device_idx, gpu_devices.size());
      return gpu_devices[device_idx];
    }
  } else {
    if (device_spec.IsSyclCPU()) {
      return ::sycl::device(::sycl::cpu_selector_v);
    } else if (device_spec.IsSyclGPU()) {
      return ::sycl::device(::sycl::gpu_selector_v);
    } else {
      return ::sycl::device(::sycl::default_selector_v);
    }
  }
}

::sycl::queue DeviceManager::GetQueue(const DeviceOrd& device_spec) const {
  if (!device_spec.IsSycl()) {
    LOG(WARNING) << "Sycl kernel is executed with non-sycl context: "
                 << device_spec.Name() << ". "
                 << "Default sycl device_selector will be used.";
  }

  QueueRegister_t& queue_register = GetQueueRegister();
  if (queue_register.count(device_spec.Name()) > 0) {
    return queue_register.at(device_spec.Name());
  }

  bool not_use_default_selector = (device_spec.ordinal != kDefaultOrdinal) ||
                                  (rabit::IsDistributed());
  std::lock_guard<std::mutex> guard(queue_registering_mutex);
  if (not_use_default_selector) {
    DeviceRegister& device_register = GetDevicesRegister();
    const int device_idx = rabit::IsDistributed() ? rabit::GetRank() : device_spec.ordinal;
    if (device_spec.IsSyclDefault()) {
      auto& devices = device_register.devices;
      CHECK_LT(device_idx, devices.size());
      queue_register[device_spec.Name()] = ::sycl::queue(devices[device_idx]);
    } else if (device_spec.IsSyclCPU()) {
      auto& cpu_devices = device_register.cpu_devices;
      CHECK_LT(device_idx, cpu_devices.size());
      queue_register[device_spec.Name()] = ::sycl::queue(cpu_devices[device_idx]);
    } else if (device_spec.IsSyclGPU()) {
      auto& gpu_devices = device_register.gpu_devices;
      CHECK_LT(device_idx, gpu_devices.size());
      queue_register[device_spec.Name()] = ::sycl::queue(gpu_devices[device_idx]);
    }
  } else {
    if (device_spec.IsSyclCPU()) {
      queue_register[device_spec.Name()] = ::sycl::queue(::sycl::cpu_selector_v);
    } else if (device_spec.IsSyclGPU()) {
      queue_register[device_spec.Name()] = ::sycl::queue(::sycl::gpu_selector_v);
    } else {
      queue_register[device_spec.Name()] = ::sycl::queue(::sycl::default_selector_v);
    }
  }
  return queue_register.at(device_spec.Name());
}

DeviceManager::DeviceRegister& DeviceManager::GetDevicesRegister() const {
  static DeviceRegister device_register;

  if (device_register.devices.size() == 0) {
    std::lock_guard<std::mutex> guard(device_registering_mutex);
    std::vector<::sycl::device> devices = ::sycl::device::get_devices();
    for (size_t i = 0; i < devices.size(); i++) {
      LOG(INFO) << "device_index = " << i << ", name = "
                << devices[i].get_info<::sycl::info::device::name>();
    }

    for (size_t i = 0; i < devices.size(); i++) {
      device_register.devices.push_back(devices[i]);
      if (devices[i].is_cpu()) {
        device_register.cpu_devices.push_back(devices[i]);
      } else if (devices[i].is_gpu()) {
        device_register.gpu_devices.push_back(devices[i]);
      }
    }
  }
  return device_register;
}

DeviceManager::QueueRegister_t& DeviceManager::GetQueueRegister() const {
  static QueueRegister_t queue_register;
  return queue_register;
}

}  // namespace sycl
}  // namespace xgboost
47
plugin/sycl/device_manager.h
Normal file
@ -0,0 +1,47 @@
/*!
 * Copyright 2017-2023 by Contributors
 * \file device_manager.h
 */
#ifndef PLUGIN_SYCL_DEVICE_MANAGER_H_
#define PLUGIN_SYCL_DEVICE_MANAGER_H_

#include <vector>
#include <mutex>
#include <string>
#include <unordered_map>

#include <CL/sycl.hpp>

#include "xgboost/context.h"

namespace xgboost {
namespace sycl {

class DeviceManager {
 public:
  ::sycl::queue GetQueue(const DeviceOrd& device_spec) const;

  ::sycl::device GetDevice(const DeviceOrd& device_spec) const;

 private:
  using QueueRegister_t = std::unordered_map<std::string, ::sycl::queue>;
  constexpr static int kDefaultOrdinal = -1;

  struct DeviceRegister {
    std::vector<::sycl::device> devices;
    std::vector<::sycl::device> cpu_devices;
    std::vector<::sycl::device> gpu_devices;
  };

  QueueRegister_t& GetQueueRegister() const;

  DeviceRegister& GetDevicesRegister() const;

  mutable std::mutex queue_registering_mutex;
  mutable std::mutex device_registering_mutex;
};

}  // namespace sycl
}  // namespace xgboost

#endif  // PLUGIN_SYCL_DEVICE_MANAGER_H_
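A minimal usage sketch for the manager above (hedged: it assumes `DeviceOrd::SyclDefault()` from `xgboost/context.h` and a SYCL runtime with at least one visible device):

```cpp
#include "plugin/sycl/device_manager.h"

// Submit a trivial kernel on the queue that DeviceManager resolves for us.
void SmokeTest() {
  xgboost::sycl::DeviceManager manager;
  ::sycl::queue qu = manager.GetQueue(xgboost::DeviceOrd::SyclDefault());
  int value = 0;
  {
    ::sycl::buffer<int, 1> buf(&value, ::sycl::range<1>(1));
    qu.submit([&](::sycl::handler& cgh) {
      auto acc = buf.get_access<::sycl::access::mode::write>(cgh);
      cgh.single_task([=]() { acc[0] = 42; });
    }).wait();
  }
  // value == 42 after the buffer's destructor writes back to the host.
}
```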
342
plugin/sycl/predictor/predictor.cc
Executable file
@ -0,0 +1,342 @@
/*!
 * Copyright by Contributors 2017-2023
 */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
#pragma GCC diagnostic ignored "-W#pragma-messages"
#include <rabit/rabit.h>
#pragma GCC diagnostic pop

#include <cstddef>
#include <limits>
#include <mutex>

#include <CL/sycl.hpp>

#include "../data.h"

#include "dmlc/registry.h"

#include "xgboost/tree_model.h"
#include "xgboost/predictor.h"
#include "xgboost/tree_updater.h"

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
#include "../../src/data/adapter.h"
#pragma GCC diagnostic pop
#include "../../src/common/math.h"
#include "../../src/gbm/gbtree_model.h"

#include "../device_manager.h"

namespace xgboost {
namespace sycl {
namespace predictor {

DMLC_REGISTRY_FILE_TAG(predictor_sycl);

/* Wrapper for descriptor of a tree node */
struct DeviceNode {
  DeviceNode()
      : fidx(-1), left_child_idx(-1), right_child_idx(-1) {}

  union NodeValue {
    float leaf_weight;
    float fvalue;
  };

  int fidx;
  int left_child_idx;
  int right_child_idx;
  NodeValue val;

  explicit DeviceNode(const RegTree::Node& n) {
    this->left_child_idx = n.LeftChild();
    this->right_child_idx = n.RightChild();
    this->fidx = n.SplitIndex();
    if (n.DefaultLeft()) {
      fidx |= (1U << 31);
    }

    if (n.IsLeaf()) {
      this->val.leaf_weight = n.LeafValue();
    } else {
      this->val.fvalue = n.SplitCond();
    }
  }

  bool IsLeaf() const { return left_child_idx == -1; }

  int GetFidx() const { return fidx & ((1U << 31) - 1U); }

  bool MissingLeft() const { return (fidx >> 31) != 0; }

  int MissingIdx() const {
    if (MissingLeft()) {
      return this->left_child_idx;
    } else {
      return this->right_child_idx;
    }
  }

  float GetFvalue() const { return val.fvalue; }

  float GetWeight() const { return val.leaf_weight; }
};

/* SYCL implementation of a device model,
 * storing tree structure in USM buffers to provide access from device kernels
 */
class DeviceModel {
 public:
  ::sycl::queue qu_;
  USMVector<DeviceNode> nodes_;
  USMVector<size_t> tree_segments_;
  USMVector<int> tree_group_;
  size_t tree_beg_;
  size_t tree_end_;
  int num_group_;

  DeviceModel() {}

  ~DeviceModel() {}

  void Init(::sycl::queue qu, const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end) {
    qu_ = qu;

    tree_segments_.Resize(&qu_, (tree_end - tree_begin) + 1);
    int sum = 0;
    tree_segments_[0] = sum;
    for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
      if (model.trees[tree_idx]->HasCategoricalSplit()) {
        LOG(FATAL) << "Categorical features are not yet supported by sycl";
      }
      sum += model.trees[tree_idx]->GetNodes().size();
      tree_segments_[tree_idx - tree_begin + 1] = sum;
    }

    nodes_.Resize(&qu_, sum);
    for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
      auto& src_nodes = model.trees[tree_idx]->GetNodes();
      for (size_t node_idx = 0; node_idx < src_nodes.size(); node_idx++)
        nodes_[node_idx + tree_segments_[tree_idx - tree_begin]] =
            static_cast<DeviceNode>(src_nodes[node_idx]);
    }

    tree_group_.Resize(&qu_, model.tree_info.size());
    for (size_t tree_idx = 0; tree_idx < model.tree_info.size(); tree_idx++)
      tree_group_[tree_idx] = model.tree_info[tree_idx];

    tree_beg_ = tree_begin;
    tree_end_ = tree_end;
    num_group_ = model.learner_model_param->num_output_group;
  }
};

float GetFvalue(int ridx, int fidx, Entry* data, size_t* row_ptr, bool* is_missing) {
  // Binary search
  auto begin_ptr = data + row_ptr[ridx];
  auto end_ptr = data + row_ptr[ridx + 1];
  Entry* previous_middle = nullptr;
  while (end_ptr != begin_ptr) {
    auto middle = begin_ptr + (end_ptr - begin_ptr) / 2;
    if (middle == previous_middle) {
      break;
    } else {
      previous_middle = middle;
    }

    if (middle->index == fidx) {
      *is_missing = false;
      return middle->fvalue;
    } else if (middle->index < fidx) {
      begin_ptr = middle;
    } else {
      end_ptr = middle;
    }
  }
  *is_missing = true;
  return 0.0;
}

float GetLeafWeight(int ridx, const DeviceNode* tree, Entry* data, size_t* row_ptr) {
  DeviceNode n = tree[0];
  int node_id = 0;
  bool is_missing;
  while (!n.IsLeaf()) {
    float fvalue = GetFvalue(ridx, n.GetFidx(), data, row_ptr, &is_missing);
    // Missing value
    if (is_missing) {
      n = tree[n.MissingIdx()];
    } else {
      if (fvalue < n.GetFvalue()) {
        node_id = n.left_child_idx;
        n = tree[n.left_child_idx];
      } else {
        node_id = n.right_child_idx;
        n = tree[n.right_child_idx];
      }
    }
  }
  return n.GetWeight();
}

void DevicePredictInternal(::sycl::queue qu,
                           sycl::DeviceMatrix* dmat,
                           HostDeviceVector<float>* out_preds,
                           const gbm::GBTreeModel& model,
                           size_t tree_begin,
                           size_t tree_end) {
  if (tree_end - tree_begin == 0) return;
  if (out_preds->HostVector().size() == 0) return;

  DeviceModel device_model;
  device_model.Init(qu, model, tree_begin, tree_end);

  auto& out_preds_vec = out_preds->HostVector();

  DeviceNode* nodes = device_model.nodes_.Data();
  ::sycl::buffer<float, 1> out_preds_buf(out_preds_vec.data(), out_preds_vec.size());
  size_t* tree_segments = device_model.tree_segments_.Data();
  int* tree_group = device_model.tree_group_.Data();
  size_t* row_ptr = dmat->row_ptr.Data();
  Entry* data = dmat->data.Data();
  int num_features = dmat->p_mat->Info().num_col_;
  int num_rows = dmat->row_ptr.Size() - 1;
  int num_group = model.learner_model_param->num_output_group;

  qu.submit([&](::sycl::handler& cgh) {
    auto out_predictions = out_preds_buf.template get_access<::sycl::access::mode::read_write>(cgh);
    cgh.parallel_for<>(::sycl::range<1>(num_rows), [=](::sycl::id<1> pid) {
      int global_idx = pid[0];
      if (global_idx >= num_rows) return;
      if (num_group == 1) {
        float sum = 0.0;
        for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
          const DeviceNode* tree = nodes + tree_segments[tree_idx - tree_begin];
          sum += GetLeafWeight(global_idx, tree, data, row_ptr);
        }
        out_predictions[global_idx] += sum;
      } else {
        for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
          const DeviceNode* tree = nodes + tree_segments[tree_idx - tree_begin];
          int out_prediction_idx = global_idx * num_group + tree_group[tree_idx];
          out_predictions[out_prediction_idx] += GetLeafWeight(global_idx, tree, data, row_ptr);
        }
      }
    });
  }).wait();
}

class Predictor : public xgboost::Predictor {
 protected:
  void InitOutPredictions(const MetaInfo& info,
                          HostDeviceVector<bst_float>* out_preds,
                          const gbm::GBTreeModel& model) const override {
    CHECK_NE(model.learner_model_param->num_output_group, 0);
    size_t n = model.learner_model_param->num_output_group * info.num_row_;
    const auto& base_margin = info.base_margin_.Data()->HostVector();
    out_preds->Resize(n);
    std::vector<bst_float>& out_preds_h = out_preds->HostVector();
    if (base_margin.size() == n) {
      CHECK_EQ(out_preds->Size(), n);
      std::copy(base_margin.begin(), base_margin.end(), out_preds_h.begin());
    } else {
      auto base_score = model.learner_model_param->BaseScore(ctx_)(0);
      if (!base_margin.empty()) {
        std::ostringstream oss;
        oss << "Ignoring the base margin, since it has incorrect length. "
            << "The base margin must be an array of length ";
        if (model.learner_model_param->num_output_group > 1) {
          oss << "[num_class] * [number of data points], i.e. "
              << model.learner_model_param->num_output_group << " * " << info.num_row_
              << " = " << n << ". ";
        } else {
          oss << "[number of data points], i.e. " << info.num_row_ << ". ";
        }
        oss << "Instead, all data points will use "
            << "base_score = " << base_score;
        LOG(WARNING) << oss.str();
      }
      std::fill(out_preds_h.begin(), out_preds_h.end(), base_score);
    }
  }

 public:
  explicit Predictor(Context const* context) :
      xgboost::Predictor::Predictor{context},
      cpu_predictor(xgboost::Predictor::Create("cpu_predictor", context)) {}

  void PredictBatch(DMatrix *dmat, PredictionCacheEntry *predts,
                    const gbm::GBTreeModel &model, uint32_t tree_begin,
                    uint32_t tree_end = 0) const override {
    ::sycl::queue qu = device_manager.GetQueue(ctx_->Device());
    // TODO(razdoburdin): remove temporary workaround after cache fix
    sycl::DeviceMatrix device_matrix(qu, dmat);

    auto* out_preds = &predts->predictions;
    if (tree_end == 0) {
      tree_end = model.trees.size();
    }

    if (tree_begin < tree_end) {
      DevicePredictInternal(qu, &device_matrix, out_preds, model, tree_begin, tree_end);
    }
  }

  bool InplacePredict(std::shared_ptr<DMatrix> p_m,
                      const gbm::GBTreeModel &model, float missing,
                      PredictionCacheEntry *out_preds, uint32_t tree_begin,
                      unsigned tree_end) const override {
    LOG(WARNING) << "InplacePredict is not yet implemented for SYCL. CPU Predictor is used.";
    return cpu_predictor->InplacePredict(p_m, model, missing, out_preds, tree_begin, tree_end);
  }

  void PredictInstance(const SparsePage::Inst& inst,
                       std::vector<bst_float>* out_preds,
                       const gbm::GBTreeModel& model, unsigned ntree_limit,
                       bool is_column_split) const override {
    LOG(WARNING) << "PredictInstance is not yet implemented for SYCL. CPU Predictor is used.";
    cpu_predictor->PredictInstance(inst, out_preds, model, ntree_limit, is_column_split);
  }

  void PredictLeaf(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_preds,
                   const gbm::GBTreeModel& model, unsigned ntree_limit) const override {
    LOG(WARNING) << "PredictLeaf is not yet implemented for SYCL. CPU Predictor is used.";
    cpu_predictor->PredictLeaf(p_fmat, out_preds, model, ntree_limit);
  }

  void PredictContribution(DMatrix* p_fmat, HostDeviceVector<float>* out_contribs,
                           const gbm::GBTreeModel& model, uint32_t ntree_limit,
                           const std::vector<bst_float>* tree_weights,
                           bool approximate, int condition,
                           unsigned condition_feature) const override {
    LOG(WARNING) << "PredictContribution is not yet implemented for SYCL. CPU Predictor is used.";
    cpu_predictor->PredictContribution(p_fmat, out_contribs, model, ntree_limit, tree_weights,
                                       approximate, condition, condition_feature);
  }

  void PredictInteractionContributions(DMatrix* p_fmat, HostDeviceVector<bst_float>* out_contribs,
                                       const gbm::GBTreeModel& model, unsigned ntree_limit,
                                       const std::vector<bst_float>* tree_weights,
                                       bool approximate) const override {
    LOG(WARNING) << "PredictInteractionContributions is not yet implemented for SYCL. "
                 << "CPU Predictor is used.";
    cpu_predictor->PredictInteractionContributions(p_fmat, out_contribs, model, ntree_limit,
                                                   tree_weights, approximate);
  }

 private:
  DeviceManager device_manager;

  std::unique_ptr<xgboost::Predictor> cpu_predictor;
};

XGBOOST_REGISTER_PREDICTOR(Predictor, "sycl_predictor")
.describe("Make predictions using SYCL.")
.set_body([](Context const* ctx) { return new Predictor(ctx); });

}  // namespace predictor
}  // namespace sycl
}  // namespace xgboost
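For orientation, the registry name in the macro above is what the predictor factory resolves at runtime. A small hedged sketch (the wrapper function is hypothetical; `Predictor::Create` is the same factory this file already uses for `"cpu_predictor"`):

```cpp
#include <memory>
#include "xgboost/predictor.h"

// Look up the SYCL predictor registered under "sycl_predictor".
std::unique_ptr<xgboost::Predictor> MakeSyclPredictor(xgboost::Context const* ctx) {
  return std::unique_ptr<xgboost::Predictor>(
      xgboost::Predictor::Create("sycl_predictor", ctx));
}
```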
@ -1,42 +0,0 @@
# DPC++-based Algorithm for Tree Construction
This plugin adds support for the OneAPI programming model to XGBoost's tree construction and prediction algorithms.

## Usage
Specify the 'objective' parameter as one of the following options to offload computation of the objective function to a OneAPI device.

### Algorithms
| objective | Description |
| --- | --- |
| reg:squarederror_oneapi | regression with squared loss |
| reg:squaredlogerror_oneapi | regression with root mean squared logarithmic loss |
| reg:logistic_oneapi | logistic regression for probability regression task |
| binary:logistic_oneapi | logistic regression for binary classification task |
| binary:logitraw_oneapi | logistic regression for classification, output score before logistic transformation |

Specify the 'predictor' parameter as one of the following options to offload the prediction stage to a OneAPI device.

### Algorithms
| predictor | Description |
| --- | --- |
| predictor_oneapi | prediction using OneAPI device |

Please note that parameter names are not finalized and can be changed during further integration of OneAPI support.

Python example:
```python
param['predictor'] = 'predictor_oneapi'
param['objective'] = 'reg:squarederror_oneapi'
```

## Dependencies
Building the plugin requires the Data Parallel C++ Compiler (https://software.intel.com/content/www/us/en/develop/tools/oneapi/components/dpc-compiler.html)

## Build
From the command line on Linux, starting from the xgboost directory:

```bash
$ mkdir build
$ cd build
$ export CXX=dpcpp && cmake .. -DPLUGIN_UPDATER_ONEAPI=ON
$ make -j
```
@ -1,447 +0,0 @@
|
||||
/*!
|
||||
* Copyright by Contributors 2017-2020
|
||||
*/
|
||||
#include <any> // for any
|
||||
#include <cstddef>
|
||||
#include <limits>
|
||||
#include <mutex>
|
||||
|
||||
#include "../../src/common/math.h"
|
||||
#include "../../src/data/adapter.h"
|
||||
#include "../../src/gbm/gbtree_model.h"
|
||||
#include "CL/sycl.hpp"
|
||||
#include "xgboost/base.h"
|
||||
#include "xgboost/data.h"
|
||||
#include "xgboost/host_device_vector.h"
|
||||
#include "xgboost/logging.h"
|
||||
#include "xgboost/predictor.h"
|
||||
#include "xgboost/tree_model.h"
|
||||
#include "xgboost/tree_updater.h"
|
||||
|
||||
namespace xgboost {
|
||||
namespace predictor {
|
||||
|
||||
DMLC_REGISTRY_FILE_TAG(predictor_oneapi);
|
||||
|
||||
/*! \brief Element from a sparse vector */
|
||||
struct EntryOneAPI {
|
||||
/*! \brief feature index */
|
||||
bst_feature_t index;
|
||||
/*! \brief feature value */
|
||||
bst_float fvalue;
|
||||
/*! \brief default constructor */
|
||||
EntryOneAPI() = default;
|
||||
/*!
|
||||
* \brief constructor with index and value
|
||||
* \param index The feature or row index.
|
||||
* \param fvalue The feature value.
|
||||
*/
|
||||
EntryOneAPI(bst_feature_t index, bst_float fvalue) : index(index), fvalue(fvalue) {}
|
||||
|
||||
EntryOneAPI(const Entry& entry) : index(entry.index), fvalue(entry.fvalue) {}
|
||||
|
||||
/*! \brief reversely compare feature values */
|
||||
inline static bool CmpValue(const EntryOneAPI& a, const EntryOneAPI& b) {
|
||||
return a.fvalue < b.fvalue;
|
||||
}
|
||||
inline bool operator==(const EntryOneAPI& other) const {
|
||||
return (this->index == other.index && this->fvalue == other.fvalue);
|
||||
}
|
||||
};
|
||||
|
||||
struct DeviceMatrixOneAPI {
|
||||
DMatrix* p_mat; // Pointer to the original matrix on the host
|
||||
cl::sycl::queue qu_;
|
||||
size_t* row_ptr;
|
||||
size_t row_ptr_size;
|
||||
EntryOneAPI* data;
|
||||
|
||||
DeviceMatrixOneAPI(DMatrix* dmat, cl::sycl::queue qu) : p_mat(dmat), qu_(qu) {
|
||||
size_t num_row = 0;
|
||||
size_t num_nonzero = 0;
|
||||
for (auto &batch : dmat->GetBatches<SparsePage>()) {
|
||||
const auto& data_vec = batch.data.HostVector();
|
||||
const auto& offset_vec = batch.offset.HostVector();
|
||||
num_nonzero += data_vec.size();
|
||||
num_row += batch.Size();
|
||||
}
|
||||
|
||||
row_ptr = cl::sycl::malloc_shared<size_t>(num_row + 1, qu_);
|
||||
data = cl::sycl::malloc_shared<EntryOneAPI>(num_nonzero, qu_);
|
||||
|
||||
size_t data_offset = 0;
|
||||
for (auto &batch : dmat->GetBatches<SparsePage>()) {
|
||||
const auto& data_vec = batch.data.HostVector();
|
||||
const auto& offset_vec = batch.offset.HostVector();
|
||||
size_t batch_size = batch.Size();
|
||||
if (batch_size > 0) {
|
||||
std::copy(offset_vec.data(), offset_vec.data() + batch_size,
|
||||
row_ptr + batch.base_rowid);
|
||||
if (batch.base_rowid > 0) {
|
||||
for(size_t i = 0; i < batch_size; i++)
|
||||
row_ptr[i + batch.base_rowid] += batch.base_rowid;
|
||||
}
|
||||
std::copy(data_vec.data(), data_vec.data() + offset_vec[batch_size],
|
||||
data + data_offset);
|
||||
data_offset += offset_vec[batch_size];
|
||||
}
|
||||
}
|
||||
row_ptr[num_row] = data_offset;
|
||||
row_ptr_size = num_row + 1;
|
||||
}
|
||||
|
||||
~DeviceMatrixOneAPI() {
|
||||
if (row_ptr) {
|
||||
cl::sycl::free(row_ptr, qu_);
|
||||
}
|
||||
if (data) {
|
||||
cl::sycl::free(data, qu_);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct DeviceNodeOneAPI {
|
||||
DeviceNodeOneAPI()
|
||||
: fidx(-1), left_child_idx(-1), right_child_idx(-1) {}
|
||||
|
||||
union NodeValue {
|
||||
float leaf_weight;
|
||||
float fvalue;
|
||||
};
|
||||
|
||||
int fidx;
|
||||
int left_child_idx;
|
||||
int right_child_idx;
|
||||
NodeValue val;
|
||||
|
||||
DeviceNodeOneAPI(const RegTree::Node& n) { // NOLINT
|
||||
this->left_child_idx = n.LeftChild();
|
||||
this->right_child_idx = n.RightChild();
|
||||
this->fidx = n.SplitIndex();
|
||||
if (n.DefaultLeft()) {
|
||||
fidx |= (1U << 31);
|
||||
}
|
||||
|
||||
if (n.IsLeaf()) {
|
||||
this->val.leaf_weight = n.LeafValue();
|
||||
} else {
|
||||
this->val.fvalue = n.SplitCond();
|
||||
}
|
||||
}
|
||||
|
||||
bool IsLeaf() const { return left_child_idx == -1; }
|
||||
|
||||
int GetFidx() const { return fidx & ((1U << 31) - 1U); }
|
||||
|
||||
bool MissingLeft() const { return (fidx >> 31) != 0; }
|
||||
|
||||
int MissingIdx() const {
|
||||
if (MissingLeft()) {
|
||||
return this->left_child_idx;
|
||||
} else {
|
||||
return this->right_child_idx;
|
||||
}
|
||||
}
|
||||
|
||||
float GetFvalue() const { return val.fvalue; }
|
||||
|
||||
float GetWeight() const { return val.leaf_weight; }
|
||||
};
|
||||
|
||||
class DeviceModelOneAPI {
|
||||
public:
|
||||
cl::sycl::queue qu_;
|
||||
DeviceNodeOneAPI* nodes;
|
||||
size_t* tree_segments;
|
||||
int* tree_group;
|
||||
size_t tree_beg_;
|
||||
size_t tree_end_;
|
||||
int num_group;
|
||||
|
||||
DeviceModelOneAPI() : nodes(nullptr), tree_segments(nullptr), tree_group(nullptr) {}
|
||||
|
||||
~DeviceModelOneAPI() {
|
||||
Reset();
|
||||
}
|
||||
|
||||
void Reset() {
|
||||
if (nodes)
|
||||
cl::sycl::free(nodes, qu_);
|
||||
if (tree_segments)
|
||||
cl::sycl::free(tree_segments, qu_);
|
||||
if (tree_group)
|
||||
cl::sycl::free(tree_group, qu_);
|
||||
}
|
||||
|
||||
void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, cl::sycl::queue qu) {
|
||||
qu_ = qu;
|
||||
CHECK_EQ(model.param.size_leaf_vector, 0);
|
||||
Reset();
|
||||
|
||||
tree_segments = cl::sycl::malloc_shared<size_t>((tree_end - tree_begin) + 1, qu_);
|
||||
int sum = 0;
|
||||
tree_segments[0] = sum;
|
||||
for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
|
||||
sum += model.trees[tree_idx]->GetNodes().size();
|
||||
tree_segments[tree_idx - tree_begin + 1] = sum;
|
||||
}
|
||||
|
||||
nodes = cl::sycl::malloc_shared<DeviceNodeOneAPI>(sum, qu_);
|
||||
for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
|
||||
auto& src_nodes = model.trees[tree_idx]->GetNodes();
|
||||
for (size_t node_idx = 0; node_idx < src_nodes.size(); node_idx++)
|
||||
nodes[node_idx + tree_segments[tree_idx - tree_begin]] = src_nodes[node_idx];
|
||||
}
|
||||
|
||||
tree_group = cl::sycl::malloc_shared<int>(model.tree_info.size(), qu_);
|
||||
for (size_t tree_idx = 0; tree_idx < model.tree_info.size(); tree_idx++)
|
||||
tree_group[tree_idx] = model.tree_info[tree_idx];
|
||||
|
||||
tree_beg_ = tree_begin;
|
||||
tree_end_ = tree_end;
|
||||
num_group = model.learner_model_param->num_output_group;
|
||||
}
|
||||
};
|
||||
|
||||
float GetFvalue(int ridx, int fidx, EntryOneAPI* data, size_t* row_ptr, bool& is_missing) {
|
||||
// Binary search
|
||||
auto begin_ptr = data + row_ptr[ridx];
|
||||
auto end_ptr = data + row_ptr[ridx + 1];
|
||||
EntryOneAPI* previous_middle = nullptr;
|
||||
while (end_ptr != begin_ptr) {
|
||||
auto middle = begin_ptr + (end_ptr - begin_ptr) / 2;
|
||||
if (middle == previous_middle) {
|
||||
break;
|
||||
} else {
|
||||
previous_middle = middle;
|
||||
}
|
||||
|
||||
if (middle->index == fidx) {
|
||||
is_missing = false;
|
||||
return middle->fvalue;
|
||||
} else if (middle->index < fidx) {
|
||||
begin_ptr = middle;
|
||||
} else {
|
||||
end_ptr = middle;
|
||||
}
|
||||
}
|
||||
is_missing = true;
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
float GetLeafWeight(int ridx, const DeviceNodeOneAPI* tree, EntryOneAPI* data, size_t* row_ptr) {
|
||||
DeviceNodeOneAPI n = tree[0];
|
||||
int node_id = 0;
|
||||
bool is_missing;
|
||||
while (!n.IsLeaf()) {
|
||||
float fvalue = GetFvalue(ridx, n.GetFidx(), data, row_ptr, is_missing);
|
||||
// Missing value
|
||||
if (is_missing) {
|
||||
n = tree[n.MissingIdx()];
|
||||
} else {
|
||||
if (fvalue < n.GetFvalue()) {
|
||||
node_id = n.left_child_idx;
|
||||
n = tree[n.left_child_idx];
|
||||
} else {
|
||||
node_id = n.right_child_idx;
|
||||
n = tree[n.right_child_idx];
|
||||
}
|
||||
}
|
||||
}
|
||||
return n.GetWeight();
|
||||
}
|
||||
|
||||
class PredictorOneAPI : public Predictor {
|
||||
protected:
|
||||
void InitOutPredictions(const MetaInfo& info,
|
||||
HostDeviceVector<bst_float>* out_preds,
|
||||
const gbm::GBTreeModel& model) const {
|
||||
CHECK_NE(model.learner_model_param->num_output_group, 0);
|
||||
size_t n = model.learner_model_param->num_output_group * info.num_row_;
|
||||
const auto& base_margin = info.base_margin_.HostVector();
|
||||
out_preds->Resize(n);
|
||||
std::vector<bst_float>& out_preds_h = out_preds->HostVector();
|
||||
if (base_margin.size() == n) {
|
||||
CHECK_EQ(out_preds->Size(), n);
|
||||
std::copy(base_margin.begin(), base_margin.end(), out_preds_h.begin());
|
||||
} else {
|
||||
if (!base_margin.empty()) {
|
||||
std::ostringstream oss;
|
||||
oss << "Ignoring the base margin, since it has incorrect length. "
|
||||
<< "The base margin must be an array of length ";
|
||||
if (model.learner_model_param->num_output_group > 1) {
|
||||
oss << "[num_class] * [number of data points], i.e. "
|
||||
<< model.learner_model_param->num_output_group << " * " << info.num_row_
|
||||
<< " = " << n << ". ";
|
||||
} else {
|
||||
oss << "[number of data points], i.e. " << info.num_row_ << ". ";
|
||||
}
|
||||
oss << "Instead, all data points will use "
|
||||
<< "base_score = " << model.learner_model_param->base_score;
|
||||
LOG(WARNING) << oss.str();
|
||||
}
|
||||
std::fill(out_preds_h.begin(), out_preds_h.end(),
|
||||
model.learner_model_param->base_score);
|
||||
}
|
||||
}
|
||||
|
||||
void DevicePredictInternal(DeviceMatrixOneAPI* dmat, HostDeviceVector<float>* out_preds,
|
||||
const gbm::GBTreeModel& model, size_t tree_begin,
|
||||
size_t tree_end) {
|
||||
if (tree_end - tree_begin == 0) {
|
||||
return;
|
||||
}
|
||||
model_.Init(model, tree_begin, tree_end, qu_);
|
||||
|
||||
auto& out_preds_vec = out_preds->HostVector();
|
||||
|
||||
DeviceNodeOneAPI* nodes = model_.nodes;
|
||||
cl::sycl::buffer<float, 1> out_preds_buf(out_preds_vec.data(), out_preds_vec.size());
|
||||
size_t* tree_segments = model_.tree_segments;
|
||||
int* tree_group = model_.tree_group;
|
||||
size_t* row_ptr = dmat->row_ptr;
|
||||
EntryOneAPI* data = dmat->data;
|
||||
int num_features = dmat->p_mat->Info().num_col_;
|
||||
int num_rows = dmat->row_ptr_size - 1;
|
||||
int num_group = model.learner_model_param->num_output_group;
|
||||
|
||||
qu_.submit([&](cl::sycl::handler& cgh) {
|
||||
auto out_predictions = out_preds_buf.get_access<cl::sycl::access::mode::read_write>(cgh);
|
||||
cgh.parallel_for<class PredictInternal>(cl::sycl::range<1>(num_rows), [=](cl::sycl::id<1> pid) {
|
||||
int global_idx = pid[0];
|
||||
if (global_idx >= num_rows) return;
|
||||
if (num_group == 1) {
|
||||
float sum = 0.0;
|
||||
for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
|
||||
const DeviceNodeOneAPI* tree = nodes + tree_segments[tree_idx - tree_begin];
|
||||
sum += GetLeafWeight(global_idx, tree, data, row_ptr);
|
||||
}
|
||||
out_predictions[global_idx] += sum;
|
||||
} else {
|
||||
for (int tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) {
|
||||
const DeviceNodeOneAPI* tree = nodes + tree_segments[tree_idx - tree_begin];
|
||||
int out_prediction_idx = global_idx * num_group + tree_group[tree_idx];
|
||||
out_predictions[out_prediction_idx] += GetLeafWeight(global_idx, tree, data, row_ptr);
|
||||
}
|
||||
}
|
||||
});
|
||||
}).wait();
|
||||
}
|
||||
|
||||
public:
|
||||
explicit PredictorOneAPI(Context const* generic_param) :
|
||||
Predictor::Predictor{generic_param}, cpu_predictor(Predictor::Create("cpu_predictor", generic_param)) {
|
||||
cl::sycl::default_selector selector;
|
||||
qu_ = cl::sycl::queue(selector);
|
||||
}
|
||||
|
||||
// ntree_limit is a very problematic parameter, as it's ambiguous in the context of
|
||||
// multi-output and forest. Same problem exists for tree_begin
|
||||
void PredictBatch(DMatrix* dmat, PredictionCacheEntry* predts,
|
||||
const gbm::GBTreeModel& model, int tree_begin,
|
||||
uint32_t const ntree_limit = 0) override {
|
||||
if (this->device_matrix_cache_.find(dmat) ==
|
||||
this->device_matrix_cache_.end()) {
|
||||
this->device_matrix_cache_.emplace(
|
||||
dmat, std::unique_ptr<DeviceMatrixOneAPI>(
|
||||
new DeviceMatrixOneAPI(dmat, qu_)));
|
||||
}
|
||||
DeviceMatrixOneAPI* device_matrix = device_matrix_cache_.find(dmat)->second.get();
// tree_begin is not used, right now we just enforce it to be 0.
CHECK_EQ(tree_begin, 0);
auto* out_preds = &predts->predictions;
CHECK_GE(predts->version, tree_begin);
if (out_preds->Size() == 0 && dmat->Info().num_row_ != 0) {
CHECK_EQ(predts->version, 0);
}
if (predts->version == 0) {
// out_preds->Size() can be non-zero as it's initialized here before any tree is
// built at the 0^th iteration.
this->InitOutPredictions(dmat->Info(), out_preds, model);
}

uint32_t const output_groups = model.learner_model_param->num_output_group;
CHECK_NE(output_groups, 0);
// Right now we just assume the ntree_limit provided by users means the number of tree
// layers in the context of a multi-output model.
uint32_t real_ntree_limit = ntree_limit * output_groups;
if (real_ntree_limit == 0 || real_ntree_limit > model.trees.size()) {
real_ntree_limit = static_cast<uint32_t>(model.trees.size());
}

uint32_t const end_version = (tree_begin + real_ntree_limit) / output_groups;
// When users have provided ntree_limit, end_version can be smaller and the cached
// predictions are invalidated.
if (predts->version > end_version) {
CHECK_NE(ntree_limit, 0);
this->InitOutPredictions(dmat->Info(), out_preds, model);
predts->version = 0;
}
uint32_t const beg_version = predts->version;
CHECK_LE(beg_version, end_version);

if (beg_version < end_version) {
DevicePredictInternal(device_matrix, out_preds, model,
beg_version * output_groups,
end_version * output_groups);
}

// delta means {size of forest} * {number of newly accumulated layers}
uint32_t delta = end_version - beg_version;
CHECK_LE(delta, model.trees.size());
predts->Update(delta);

CHECK(out_preds->Size() == output_groups * dmat->Info().num_row_ ||
out_preds->Size() == dmat->Info().num_row_);
}

void InplacePredict(std::any const& x, const gbm::GBTreeModel& model, float missing,
PredictionCacheEntry* out_preds, uint32_t tree_begin,
unsigned tree_end) const override {
cpu_predictor->InplacePredict(x, model, missing, out_preds, tree_begin, tree_end);
}

void PredictInstance(const SparsePage::Inst& inst,
std::vector<bst_float>* out_preds,
const gbm::GBTreeModel& model, unsigned ntree_limit) override {
cpu_predictor->PredictInstance(inst, out_preds, model, ntree_limit);
}

void PredictLeaf(DMatrix* p_fmat, std::vector<bst_float>* out_preds,
const gbm::GBTreeModel& model, unsigned ntree_limit) override {
cpu_predictor->PredictLeaf(p_fmat, out_preds, model, ntree_limit);
}

void PredictContribution(DMatrix* p_fmat, std::vector<bst_float>* out_contribs,
const gbm::GBTreeModel& model, uint32_t ntree_limit,
std::vector<bst_float>* tree_weights,
bool approximate, int condition,
unsigned condition_feature) override {
cpu_predictor->PredictContribution(p_fmat, out_contribs, model, ntree_limit, tree_weights, approximate, condition, condition_feature);
}

void PredictInteractionContributions(DMatrix* p_fmat, std::vector<bst_float>* out_contribs,
const gbm::GBTreeModel& model, unsigned ntree_limit,
std::vector<bst_float>* tree_weights,
bool approximate) override {
cpu_predictor->PredictInteractionContributions(p_fmat, out_contribs, model, ntree_limit, tree_weights, approximate);
}

private:
cl::sycl::queue qu_;
DeviceModelOneAPI model_;

std::mutex lock_;
std::unique_ptr<Predictor> cpu_predictor;

std::unordered_map<DMatrix*, std::unique_ptr<DeviceMatrixOneAPI>>
device_matrix_cache_;
};

XGBOOST_REGISTER_PREDICTOR(PredictorOneAPI, "oneapi_predictor")
.describe("Make predictions using DPC++.")
.set_body([](Context const* generic_param) {
return new PredictorOneAPI(generic_param);
});
} // namespace predictor
} // namespace xgboost
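The cache bookkeeping above is terse, so here is a minimal standalone sketch of the same layer-versioning arithmetic (plain C++ with hypothetical values, not the XGBoost API): one layer holds `output_groups` trees, the cache version counts accumulated layers, and only the trees of the missing layers get re-evaluated.

// Hedged sketch of the prediction-cache versioning used above.
#include <cstdint>
#include <iostream>

int main() {
  std::uint32_t const output_groups = 3;  // e.g. a 3-class softmax model
  std::uint32_t const n_trees = 12;       // 4 layers of 3 trees each
  std::uint32_t ntree_limit = 0;          // 0 means "use all trees"

  std::uint32_t real_ntree_limit = ntree_limit * output_groups;
  if (real_ntree_limit == 0 || real_ntree_limit > n_trees) {
    real_ntree_limit = n_trees;
  }

  std::uint32_t const beg_version = 2;  // layers already in the cache
  std::uint32_t const end_version = real_ntree_limit / output_groups;  // 4
  std::uint32_t const delta = end_version - beg_version;               // 2 new layers

  // Only trees [beg * groups, end * groups) must be evaluated.
  std::cout << "predict trees [" << beg_version * output_groups << ", "
            << end_version * output_groups << "), delta=" << delta << "\n";
}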
@ -1,145 +0,0 @@
/*!
* Copyright 2017-2020 XGBoost contributors
*/
#ifndef XGBOOST_OBJECTIVE_REGRESSION_LOSS_ONEAPI_H_
#define XGBOOST_OBJECTIVE_REGRESSION_LOSS_ONEAPI_H_

#include <dmlc/omp.h>
#include <xgboost/logging.h>
#include <algorithm>

#include "CL/sycl.hpp"

namespace xgboost {
namespace obj {

/*!
* \brief calculate the sigmoid of the input.
* \param x input parameter
* \return the transformed value.
*/
inline float SigmoidOneAPI(float x) {
return 1.0f / (1.0f + cl::sycl::exp(-x));
}

// common regressions
// linear regression
struct LinearSquareLossOneAPI {
static bst_float PredTransform(bst_float x) { return x; }
static bool CheckLabel(bst_float x) { return true; }
static bst_float FirstOrderGradient(bst_float predt, bst_float label) {
return predt - label;
}
static bst_float SecondOrderGradient(bst_float predt, bst_float label) {
return 1.0f;
}
static bst_float ProbToMargin(bst_float base_score) { return base_score; }
static const char* LabelErrorMsg() { return ""; }
static const char* DefaultEvalMetric() { return "rmse"; }

static const char* Name() { return "reg:squarederror_oneapi"; }
};

// TODO: DPC++ does not fully support std math inside offloaded kernels
struct SquaredLogErrorOneAPI {
static bst_float PredTransform(bst_float x) { return x; }
static bool CheckLabel(bst_float label) {
return label > -1;
}
static bst_float FirstOrderGradient(bst_float predt, bst_float label) {
predt = std::max(predt, (bst_float)(-1 + 1e-6)); // ensure correct value for log1p
return (cl::sycl::log1p(predt) - cl::sycl::log1p(label)) / (predt + 1);
}
static bst_float SecondOrderGradient(bst_float predt, bst_float label) {
predt = std::max(predt, (bst_float)(-1 + 1e-6));
float res = (-cl::sycl::log1p(predt) + cl::sycl::log1p(label) + 1) /
cl::sycl::pow(predt + 1, (bst_float)2);
res = std::max(res, (bst_float)1e-6f);
return res;
}
static bst_float ProbToMargin(bst_float base_score) { return base_score; }
static const char* LabelErrorMsg() {
return "label must be greater than -1 for rmsle so that log(label + 1) can be valid.";
}
static const char* DefaultEvalMetric() { return "rmsle"; }

static const char* Name() { return "reg:squaredlogerror_oneapi"; }
};

// logistic loss for probability regression task
struct LogisticRegressionOneAPI {
// duplication is necessary, as __device__ specifier
// cannot be made conditional on template parameter
static bst_float PredTransform(bst_float x) { return SigmoidOneAPI(x); }
static bool CheckLabel(bst_float x) { return x >= 0.0f && x <= 1.0f; }
static bst_float FirstOrderGradient(bst_float predt, bst_float label) {
return predt - label;
}
static bst_float SecondOrderGradient(bst_float predt, bst_float label) {
const bst_float eps = 1e-16f;
return std::max(predt * (1.0f - predt), eps);
}
template <typename T>
static T PredTransform(T x) { return SigmoidOneAPI(x); }
template <typename T>
static T FirstOrderGradient(T predt, T label) { return predt - label; }
template <typename T>
static T SecondOrderGradient(T predt, T label) {
const T eps = T(1e-16f);
return std::max(predt * (T(1.0f) - predt), eps);
}
static bst_float ProbToMargin(bst_float base_score) {
CHECK(base_score > 0.0f && base_score < 1.0f)
<< "base_score must be in (0,1) for logistic loss, got: " << base_score;
return -logf(1.0f / base_score - 1.0f);
}
static const char* LabelErrorMsg() {
return "label must be in [0,1] for logistic regression";
}
static const char* DefaultEvalMetric() { return "rmse"; }

static const char* Name() { return "reg:logistic_oneapi"; }
};

// logistic loss for binary classification task
struct LogisticClassificationOneAPI : public LogisticRegressionOneAPI {
static const char* DefaultEvalMetric() { return "logloss"; }
static const char* Name() { return "binary:logistic_oneapi"; }
};

// logistic loss, but predict un-transformed margin
struct LogisticRawOneAPI : public LogisticRegressionOneAPI {
// duplication is necessary, as __device__ specifier
// cannot be made conditional on template parameter
static bst_float PredTransform(bst_float x) { return x; }
static bst_float FirstOrderGradient(bst_float predt, bst_float label) {
predt = SigmoidOneAPI(predt);
return predt - label;
}
static bst_float SecondOrderGradient(bst_float predt, bst_float label) {
const bst_float eps = 1e-16f;
predt = SigmoidOneAPI(predt);
return std::max(predt * (1.0f - predt), eps);
}
template <typename T>
static T PredTransform(T x) { return x; }
template <typename T>
static T FirstOrderGradient(T predt, T label) {
predt = SigmoidOneAPI(predt);
return predt - label;
}
template <typename T>
static T SecondOrderGradient(T predt, T label) {
const T eps = T(1e-16f);
predt = SigmoidOneAPI(predt);
return std::max(predt * (T(1.0f) - predt), eps);
}
static const char* DefaultEvalMetric() { return "logloss"; }

static const char* Name() { return "binary:logitraw_oneapi"; }
};

} // namespace obj
} // namespace xgboost

#endif // XGBOOST_OBJECTIVE_REGRESSION_LOSS_ONEAPI_H_
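The loss structs in the header deleted above each reduce to a (PredTransform, first-order, second-order) triple. As a reference point, here is a host-only sketch of the logistic case with illustrative numbers (plain C++, no SYCL; this is not part of the removed file):

// Hedged sketch: the logistic gradient pair computed by the removed structs.
#include <algorithm>
#include <cmath>
#include <cstdio>

float Sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

int main() {
  float const margin = 0.3f, label = 1.0f;
  float const p = Sigmoid(margin);                      // PredTransform
  float const grad = p - label;                         // first-order gradient
  float const hess = std::max(p * (1.0f - p), 1e-16f);  // clamped second-order
  std::printf("p=%f grad=%f hess=%f\n", p, grad, hess);
}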
@ -1,182 +0,0 @@
#include <xgboost/logging.h>
#include <xgboost/objective.h>
#include <cmath>
#include <memory>
#include <vector>

#include "xgboost/host_device_vector.h"
#include "xgboost/json.h"
#include "xgboost/parameter.h"
#include "xgboost/span.h"

#include "../../src/common/transform.h"
#include "../../src/common/common.h"
#include "./regression_loss_oneapi.h"

#include "CL/sycl.hpp"

namespace xgboost {
namespace obj {

DMLC_REGISTRY_FILE_TAG(regression_obj_oneapi);

struct RegLossParamOneAPI : public XGBoostParameter<RegLossParamOneAPI> {
float scale_pos_weight;
// declare parameters
DMLC_DECLARE_PARAMETER(RegLossParamOneAPI) {
DMLC_DECLARE_FIELD(scale_pos_weight).set_default(1.0f).set_lower_bound(0.0f)
.describe("Scale the weight of positive examples by this factor");
}
};

template<typename Loss>
class RegLossObjOneAPI : public ObjFunction {
protected:
HostDeviceVector<int> label_correct_;

public:
RegLossObjOneAPI() = default;

void Configure(const std::vector<std::pair<std::string, std::string> >& args) override {
param_.UpdateAllowUnknown(args);

cl::sycl::default_selector selector;
qu_ = cl::sycl::queue(selector);
}

void GetGradient(const HostDeviceVector<bst_float>& preds,
const MetaInfo &info,
int iter,
HostDeviceVector<GradientPair>* out_gpair) override {
if (info.labels_.Size() == 0U) {
LOG(WARNING) << "Label set is empty.";
}
CHECK_EQ(preds.Size(), info.labels_.Size())
<< " " << "labels are not correctly provided, "
<< "preds.size=" << preds.Size() << ", label.size=" << info.labels_.Size() << ", "
<< "Loss: " << Loss::Name();

size_t const ndata = preds.Size();
out_gpair->Resize(ndata);

// TODO: add label_correct check
label_correct_.Resize(1);
label_correct_.Fill(1);

bool is_null_weight = info.weights_.Size() == 0;

cl::sycl::buffer<bst_float, 1> preds_buf(preds.HostPointer(), preds.Size());
cl::sycl::buffer<bst_float, 1> labels_buf(info.labels_.HostPointer(), info.labels_.Size());
cl::sycl::buffer<GradientPair, 1> out_gpair_buf(out_gpair->HostPointer(), out_gpair->Size());
cl::sycl::buffer<bst_float, 1> weights_buf(is_null_weight ? NULL : info.weights_.HostPointer(),
is_null_weight ? 1 : info.weights_.Size());

cl::sycl::buffer<int, 1> additional_input_buf(1);
{
auto additional_input_acc = additional_input_buf.get_access<cl::sycl::access::mode::write>();
additional_input_acc[0] = 1; // Fill the label_correct flag
}

auto scale_pos_weight = param_.scale_pos_weight;
if (!is_null_weight) {
CHECK_EQ(info.weights_.Size(), ndata)
<< "Number of weights should be equal to number of data points.";
}

qu_.submit([&](cl::sycl::handler& cgh) {
auto preds_acc = preds_buf.get_access<cl::sycl::access::mode::read>(cgh);
auto labels_acc = labels_buf.get_access<cl::sycl::access::mode::read>(cgh);
auto weights_acc = weights_buf.get_access<cl::sycl::access::mode::read>(cgh);
auto out_gpair_acc = out_gpair_buf.get_access<cl::sycl::access::mode::write>(cgh);
auto additional_input_acc = additional_input_buf.get_access<cl::sycl::access::mode::write>(cgh);
cgh.parallel_for<>(cl::sycl::range<1>(ndata), [=](cl::sycl::id<1> pid) {
int idx = pid[0];
bst_float p = Loss::PredTransform(preds_acc[idx]);
bst_float w = is_null_weight ? 1.0f : weights_acc[idx];
bst_float label = labels_acc[idx];
if (label == 1.0f) {
w *= scale_pos_weight;
}
if (!Loss::CheckLabel(label)) {
// If there is an incorrect label, the host code will know.
additional_input_acc[0] = 0;
}
out_gpair_acc[idx] = GradientPair(Loss::FirstOrderGradient(p, label) * w,
Loss::SecondOrderGradient(p, label) * w);
});
}).wait();

int flag = 1;
{
auto additional_input_acc = additional_input_buf.get_access<cl::sycl::access::mode::read>();
flag = additional_input_acc[0];
}

if (flag == 0) {
LOG(FATAL) << Loss::LabelErrorMsg();
}

}

public:
const char* DefaultEvalMetric() const override {
return Loss::DefaultEvalMetric();
}

void PredTransform(HostDeviceVector<float> *io_preds) override {
size_t const ndata = io_preds->Size();

cl::sycl::buffer<bst_float, 1> io_preds_buf(io_preds->HostPointer(), io_preds->Size());

qu_.submit([&](cl::sycl::handler& cgh) {
auto io_preds_acc = io_preds_buf.get_access<cl::sycl::access::mode::read_write>(cgh);
cgh.parallel_for<>(cl::sycl::range<1>(ndata), [=](cl::sycl::id<1> pid) {
int idx = pid[0];
io_preds_acc[idx] = Loss::PredTransform(io_preds_acc[idx]);
});
}).wait();
}

float ProbToMargin(float base_score) const override {
return Loss::ProbToMargin(base_score);
}

void SaveConfig(Json* p_out) const override {
auto& out = *p_out;
out["name"] = String(Loss::Name());
out["reg_loss_param"] = ToJson(param_);
}

void LoadConfig(Json const& in) override {
FromJson(in["reg_loss_param"], &param_);
}

protected:
RegLossParamOneAPI param_;

cl::sycl::queue qu_;
};

// register the objective functions
DMLC_REGISTER_PARAMETER(RegLossParamOneAPI);

// TODO: Find a better way to dispatch names of DPC++ kernels with various template parameters of loss function
XGBOOST_REGISTER_OBJECTIVE(SquaredLossRegressionOneAPI, LinearSquareLossOneAPI::Name())
.describe("Regression with squared error with DPC++ backend.")
.set_body([]() { return new RegLossObjOneAPI<LinearSquareLossOneAPI>(); });
XGBOOST_REGISTER_OBJECTIVE(SquareLogErrorOneAPI, SquaredLogErrorOneAPI::Name())
.describe("Regression with root mean squared logarithmic error with DPC++ backend.")
.set_body([]() { return new RegLossObjOneAPI<SquaredLogErrorOneAPI>(); });
XGBOOST_REGISTER_OBJECTIVE(LogisticRegressionOneAPI, LogisticRegressionOneAPI::Name())
.describe("Logistic regression for probability regression task with DPC++ backend.")
.set_body([]() { return new RegLossObjOneAPI<LogisticRegressionOneAPI>(); });
XGBOOST_REGISTER_OBJECTIVE(LogisticClassificationOneAPI, LogisticClassificationOneAPI::Name())
.describe("Logistic regression for binary classification task with DPC++ backend.")
.set_body([]() { return new RegLossObjOneAPI<LogisticClassificationOneAPI>(); });
XGBOOST_REGISTER_OBJECTIVE(LogisticRawOneAPI, LogisticRawOneAPI::Name())
.describe("Logistic regression for classification, output score "
"before logistic transformation with DPC++ backend.")
.set_body([]() { return new RegLossObjOneAPI<LogisticRawOneAPI>(); });

} // namespace obj
} // namespace xgboost
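One technique in the removed objective deserves a note: labels are validated inside the offloaded kernel by clearing a one-element flag buffer, which the host inspects after the queue drains. A minimal sketch of that pattern, assuming a DPC++/SYCL 1.2.1 toolchain (the names and the [0, 1] valid range are illustrative only):

// Hedged sketch of the kernel-side error-flag pattern used in GetGradient.
#include <CL/sycl.hpp>
#include <iostream>
#include <vector>

int main() {
  std::vector<float> labels{0.5f, 1.0f, 1.5f};  // 1.5f is out of [0, 1]
  cl::sycl::queue qu{cl::sycl::default_selector{}};
  cl::sycl::buffer<float, 1> labels_buf(labels.data(), labels.size());
  cl::sycl::buffer<int, 1> flag_buf(1);
  {
    auto acc = flag_buf.get_access<cl::sycl::access::mode::write>();
    acc[0] = 1;  // assume all labels are valid until proven otherwise
  }
  qu.submit([&](cl::sycl::handler& cgh) {
    auto l = labels_buf.get_access<cl::sycl::access::mode::read>(cgh);
    auto flag = flag_buf.get_access<cl::sycl::access::mode::write>(cgh);
    cgh.parallel_for<class check_labels>(
        cl::sycl::range<1>(labels.size()), [=](cl::sycl::id<1> i) {
          if (l[i] < 0.0f || l[i] > 1.0f) flag[0] = 0;  // host will see this
        });
  }).wait();
  auto acc = flag_buf.get_access<cl::sycl::access::mode::read>();
  std::cout << (acc[0] ? "labels ok" : "invalid label found") << "\n";
}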
@ -15,18 +15,16 @@ class BuildConfiguration: # pylint: disable=R0902
use_cuda: bool = False
# Whether to enable NCCL
use_nccl: bool = False
# Whether to load nccl dynamically
use_dlopen_nccl: bool = False
# Whether to enable federated learning
plugin_federated: bool = False
# Whether to enable rmm support
plugin_rmm: bool = False
# Whether to enable HIP
use_hip: bool = False
# Whether to enable RCCL
use_rccl: bool = False
# Whether to enable HDFS
use_hdfs: bool = False
# Whether to enable Azure Storage
use_azure: bool = False
# Whether to enable AWS S3
use_s3: bool = False
# Whether to enable the dense parser plugin
plugin_dense_parser: bool = False
# Special option: See explanation below
use_system_libxgboost: bool = False


@ -29,7 +29,8 @@ classifiers = [
]
dependencies = [
"numpy",
"scipy"
"scipy",
"nvidia-nccl-cu12 ; platform_system == 'Linux' and platform_machine != 'aarch64'"
]

[project.urls]

@ -2,14 +2,15 @@
import ctypes
import json
import logging
import os
import pickle
from enum import IntEnum, unique
from typing import Any, Dict, List
from typing import Any, Dict, List, Optional

import numpy as np

from ._typing import _T
from .core import _LIB, _check_call, c_str, from_pystr_to_cstr, py_str
from .core import _LIB, _check_call, build_info, c_str, from_pystr_to_cstr, py_str

LOGGER = logging.getLogger("[xgboost.collective]")

@ -250,6 +251,31 @@ class CommunicatorContext:

def __init__(self, **args: Any) -> None:
self.args = args
key = "dmlc_nccl_path"
if args.get(key, None) is not None:
return

binfo = build_info()
if not binfo["USE_DLOPEN_NCCL"] and not binfo["USE_DLOPEN_RCCL"]:
return

try:
# PyPI package of NCCL.
from nvidia.nccl import lib

# There are two versions of nvidia-nccl: one from PyPI, the other from
# nvidia-pyindex. We support only the former, as the latter is too old
# (2.9.8 as of writing).
if lib.__file__ is not None:
dirname: Optional[str] = os.path.dirname(lib.__file__)
else:
dirname = None

if dirname:
path = os.path.join(dirname, "libnccl.so.2")
self.args[key] = path
except ImportError:
pass

def __enter__(self) -> Dict[str, Any]:
init(**self.args)

@ -184,6 +184,13 @@ def _py_version() -> str:
return f.read().strip()


def _register_log_callback(lib: ctypes.CDLL) -> None:
lib.XGBGetLastError.restype = ctypes.c_char_p
lib.callback = _get_log_callback_func() # type: ignore
if lib.XGBRegisterLogCallback(lib.callback) != 0:
raise XGBoostError(lib.XGBGetLastError())


def _load_lib() -> ctypes.CDLL:
"""Load xgboost Library."""
lib_paths = find_lib_path()
@ -228,10 +235,7 @@ Likely causes:
Error message(s): {os_error_list}
"""
)
lib.XGBGetLastError.restype = ctypes.c_char_p
lib.callback = _get_log_callback_func() # type: ignore
if lib.XGBRegisterLogCallback(lib.callback) != 0:
raise XGBoostError(lib.XGBGetLastError())
_register_log_callback(lib)

def parse(ver: str) -> Tuple[int, int, int]:
"""Avoid dependency on packaging (PEP 440)."""

@ -79,7 +79,6 @@ from xgboost.data import _is_cudf_ser, _is_cupy_array
from xgboost.sklearn import (
XGBClassifier,
XGBClassifierBase,
XGBClassifierMixIn,
XGBModel,
XGBRanker,
XGBRankerMixIn,
@ -94,6 +93,8 @@ from xgboost.sklearn import (
from xgboost.tracker import RabitTracker, get_host_ip
from xgboost.training import train as worker_train

from .utils import get_n_threads

if TYPE_CHECKING:
import dask
import distributed
@ -908,6 +909,34 @@ async def _check_workers_are_alive(
raise RuntimeError(f"Missing required workers: {missing_workers}")


def _get_dmatrices(
train_ref: dict,
train_id: int,
*refs: dict,
evals_id: Sequence[int],
evals_name: Sequence[str],
n_threads: int,
) -> Tuple[DMatrix, List[Tuple[DMatrix, str]]]:
Xy = _dmatrix_from_list_of_parts(**train_ref, nthread=n_threads)
evals: List[Tuple[DMatrix, str]] = []
for i, ref in enumerate(refs):
if evals_id[i] == train_id:
evals.append((Xy, evals_name[i]))
continue
if ref.get("ref", None) is not None:
if ref["ref"] != train_id:
raise ValueError(
"The training DMatrix should be used as a reference to evaluation"
" `QuantileDMatrix`."
)
del ref["ref"]
eval_Xy = _dmatrix_from_list_of_parts(**ref, nthread=n_threads, ref=Xy)
else:
eval_Xy = _dmatrix_from_list_of_parts(**ref, nthread=n_threads)
evals.append((eval_Xy, evals_name[i]))
return Xy, evals


async def _train_async(
client: "distributed.Client",
global_config: Dict[str, Any],
@ -940,41 +969,20 @@
) -> Optional[TrainReturnT]:
worker = distributed.get_worker()
local_param = parameters.copy()
n_threads = 0
# dask worker nthreads, "state" is available in 2022.6.1
dwnt = worker.state.nthreads if hasattr(worker, "state") else worker.nthreads
for p in ["nthread", "n_jobs"]:
if (
local_param.get(p, None) is not None
and local_param.get(p, dwnt) != dwnt
):
LOGGER.info("Overriding `nthreads` defined in dask worker.")
n_threads = local_param[p]
break
if n_threads == 0 or n_threads is None:
n_threads = dwnt
n_threads = get_n_threads(local_param, worker)
local_param.update({"nthread": n_threads, "n_jobs": n_threads})

local_history: TrainingCallback.EvalsLog = {}

with CommunicatorContext(**rabit_args), config.config_context(**global_config):
Xy = _dmatrix_from_list_of_parts(**train_ref, nthread=n_threads)
evals: List[Tuple[DMatrix, str]] = []
for i, ref in enumerate(refs):
if evals_id[i] == train_id:
evals.append((Xy, evals_name[i]))
continue
if ref.get("ref", None) is not None:
if ref["ref"] != train_id:
raise ValueError(
"The training DMatrix should be used as a reference"
" to evaluation `QuantileDMatrix`."
)
del ref["ref"]
eval_Xy = _dmatrix_from_list_of_parts(
**ref, nthread=n_threads, ref=Xy
)
else:
eval_Xy = _dmatrix_from_list_of_parts(**ref, nthread=n_threads)
evals.append((eval_Xy, evals_name[i]))
Xy, evals = _get_dmatrices(
train_ref,
train_id,
*refs,
evals_id=evals_id,
evals_name=evals_name,
n_threads=n_threads,
)

booster = worker_train(
params=local_param,
@ -1854,7 +1862,7 @@ class DaskXGBRegressor(DaskScikitLearnBase, XGBRegressorBase):
"Implementation of the scikit-learn API for XGBoost classification.",
["estimators", "model"],
)
class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierMixIn, XGBClassifierBase):
class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierBase):
# pylint: disable=missing-class-docstring
async def _fit_async(
self,
@ -2036,10 +2044,6 @@ class DaskXGBClassifier(DaskScikitLearnBase, XGBClassifierMixIn, XGBClassifierBa
preds = da.map_blocks(_argmax, pred_probs, drop_axis=1)
return preds

def load_model(self, fname: ModelIn) -> None:
super().load_model(fname)
self._load_model_attributes(self.get_booster())


@xgboost_model_doc(
"""Implementation of the Scikit-Learn API for XGBoost Ranking.

24
python-package/xgboost/dask/utils.py
Normal file
@ -0,0 +1,24 @@
"""Utilities for the XGBoost Dask interface."""
import logging
from typing import TYPE_CHECKING, Any, Dict

LOGGER = logging.getLogger("[xgboost.dask]")


if TYPE_CHECKING:
import distributed


def get_n_threads(local_param: Dict[str, Any], worker: "distributed.Worker") -> int:
"""Get the number of threads from a worker and the user-supplied parameters."""
# dask worker nthreads, "state" is available in 2022.6.1
dwnt = worker.state.nthreads if hasattr(worker, "state") else worker.nthreads
n_threads = None
for p in ["nthread", "n_jobs"]:
if local_param.get(p, None) is not None and local_param.get(p, dwnt) != dwnt:
LOGGER.info("Overriding `nthreads` defined in dask worker.")
n_threads = local_param[p]
break
if n_threads == 0 or n_threads is None:
n_threads = dwnt
return n_threads
@ -43,19 +43,6 @@ from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array, _is_pandas_df
from .training import train


class XGBClassifierMixIn: # pylint: disable=too-few-public-methods
"""MixIn for classification."""

def __init__(self, *args: Any, **kwargs: Any) -> None:
super().__init__(*args, **kwargs)

def _load_model_attributes(self, booster: Booster) -> None:
config = json.loads(booster.save_config())
self.n_classes_ = int(config["learner"]["learner_model_param"]["num_class"])
# binary classification is treated as regression in XGBoost.
self.n_classes_ = 2 if self.n_classes_ < 2 else self.n_classes_


class XGBRankerMixIn: # pylint: disable=too-few-public-methods
"""MixIn for ranking, defines the _estimator_type usually defined in scikit-learn
base classes.
@ -808,7 +795,6 @@ class XGBModel(XGBModelBase):
"kwargs",
"missing",
"n_estimators",
"use_label_encoder",
"enable_categorical",
"early_stopping_rounds",
"callbacks",
@ -851,21 +837,38 @@ class XGBModel(XGBModelBase):
self.get_booster().load_model(fname)

meta_str = self.get_booster().attr("scikit_learn")
if meta_str is None:
return
if meta_str is not None:
meta = json.loads(meta_str)
t = meta.get("_estimator_type", None)
if t is not None and t != self._get_type():
raise TypeError(
"Loading an estimator with different type. Expecting: "
f"{self._get_type()}, got: {t}"
)

meta = json.loads(meta_str)
t = meta.get("_estimator_type", None)
if t is not None and t != self._get_type():
raise TypeError(
"Loading an estimator with different type. Expecting: "
f"{self._get_type()}, got: {t}"
)
self.feature_types = self.get_booster().feature_types
self.get_booster().set_attr(scikit_learn=None)
config = json.loads(self.get_booster().save_config())
self._load_model_attributes(config)

load_model.__doc__ = f"""{Booster.load_model.__doc__}"""

def _load_model_attributes(self, config: dict) -> None:
"""Load model attributes without hyper-parameters."""
from sklearn.base import is_classifier

booster = self.get_booster()

self.objective = config["learner"]["objective"]["name"]
self.booster = config["learner"]["gradient_booster"]["name"]
self.base_score = config["learner"]["learner_model_param"]["base_score"]
self.feature_types = booster.feature_types

if is_classifier(self):
self.n_classes_ = int(config["learner"]["learner_model_param"]["num_class"])
# binary classification is treated as regression in XGBoost.
self.n_classes_ = 2 if self.n_classes_ < 2 else self.n_classes_

# pylint: disable=too-many-branches
def _configure_fit(
self,
@ -1415,7 +1418,7 @@ def _cls_predict_proba(n_classes: int, prediction: PredtT, vstack: Callable) ->
Number of boosting rounds.
""",
)
class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
class XGBClassifier(XGBModel, XGBClassifierBase):
# pylint: disable=missing-docstring,invalid-name,too-many-instance-attributes
@_deprecate_positional_args
def __init__(
@ -1643,10 +1646,6 @@ class XGBClassifier(XGBModel, XGBClassifierMixIn, XGBClassifierBase):
def classes_(self) -> np.ndarray:
return np.arange(self.n_classes_)

def load_model(self, fname: ModelIn) -> None:
super().load_model(fname)
self._load_model_attributes(self.get_booster())


@xgboost_model_doc(
"scikit-learn API for XGBoost random forest classification.",
@ -2099,7 +2098,17 @@ class XGBRanker(XGBModel, XGBRankerMixIn):

"""
X, qid = _get_qid(X, None)
Xyq = DMatrix(X, y, qid=qid)
# fixme(jiamingy): base margin and group weight are not yet supported. We might
# need to make extra special fields in the dataframe.
Xyq = DMatrix(
X,
y,
qid=qid,
missing=self.missing,
enable_categorical=self.enable_categorical,
nthread=self.n_jobs,
feature_types=self.feature_types,
)
if callable(self.eval_metric):
metric = ltr_metric_decorator(self.eval_metric, self.n_jobs)
result_str = self.get_booster().eval_set([(Xyq, "eval")], feval=metric)

|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from pyspark import RDD, SparkContext, cloudpickle
|
||||
from pyspark import RDD, SparkConf, SparkContext, cloudpickle
|
||||
from pyspark.ml import Estimator, Model
|
||||
from pyspark.ml.functions import array_to_vector, vector_to_array
|
||||
from pyspark.ml.linalg import VectorUDT
|
||||
@ -138,7 +138,6 @@ _inverse_pyspark_param_alias_map = {v: k for k, v in _pyspark_param_alias_map.it
|
||||
_unsupported_xgb_params = [
|
||||
"gpu_id", # we have "device" pyspark param instead.
|
||||
"enable_categorical", # Use feature_types param to specify categorical feature instead
|
||||
"use_label_encoder",
|
||||
"n_jobs", # Do not allow user to set it, will use `spark.task.cpus` value instead.
|
||||
"nthread", # Ditto
|
||||
]
|
||||
@ -368,7 +367,10 @@ class _SparkXGBParams(
|
||||
" on GPU."
|
||||
)
|
||||
|
||||
if not (ss.version >= "3.4.0" and _is_standalone_or_localcluster(sc)):
|
||||
if not (
|
||||
ss.version >= "3.4.0"
|
||||
and _is_standalone_or_localcluster(sc.getConf())
|
||||
):
|
||||
# We will enable stage-level scheduling in spark 3.4.0+ which doesn't
|
||||
# require spark.task.resource.gpu.amount to be set explicitly
|
||||
gpu_per_task = sc.getConf().get("spark.task.resource.gpu.amount")
|
||||
@ -907,30 +909,27 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
|
||||
|
||||
return booster_params, train_call_kwargs_params, dmatrix_kwargs
|
||||
|
||||
def _skip_stage_level_scheduling(self) -> bool:
|
||||
def _skip_stage_level_scheduling(self, spark_version: str, conf: SparkConf) -> bool:
|
||||
# pylint: disable=too-many-return-statements
|
||||
"""Check if stage-level scheduling is not needed,
|
||||
return true to skip stage-level scheduling"""
|
||||
|
||||
if self._run_on_gpu():
|
||||
ss = _get_spark_session()
|
||||
sc = ss.sparkContext
|
||||
|
||||
if ss.version < "3.4.0":
|
||||
if spark_version < "3.4.0":
|
||||
self.logger.info(
|
||||
"Stage-level scheduling in xgboost requires spark version 3.4.0+"
|
||||
)
|
||||
return True
|
||||
|
||||
if not _is_standalone_or_localcluster(sc):
|
||||
if not _is_standalone_or_localcluster(conf):
|
||||
self.logger.info(
|
||||
"Stage-level scheduling in xgboost requires spark standalone or "
|
||||
"local-cluster mode"
|
||||
)
|
||||
return True
|
||||
|
||||
executor_cores = sc.getConf().get("spark.executor.cores")
|
||||
executor_gpus = sc.getConf().get("spark.executor.resource.gpu.amount")
|
||||
executor_cores = conf.get("spark.executor.cores")
|
||||
executor_gpus = conf.get("spark.executor.resource.gpu.amount")
|
||||
if executor_cores is None or executor_gpus is None:
|
||||
self.logger.info(
|
||||
"Stage-level scheduling in xgboost requires spark.executor.cores, "
|
||||
@ -955,7 +954,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
|
||||
)
|
||||
return True
|
||||
|
||||
task_gpu_amount = sc.getConf().get("spark.task.resource.gpu.amount")
|
||||
task_gpu_amount = conf.get("spark.task.resource.gpu.amount")
|
||||
|
||||
if task_gpu_amount is None:
|
||||
# The ETL tasks will not grab a gpu when spark.task.resource.gpu.amount is not set,
|
||||
@ -975,14 +974,13 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable):
|
||||
|
||||
def _try_stage_level_scheduling(self, rdd: RDD) -> RDD:
|
||||
"""Try to enable stage-level scheduling"""
|
||||
|
||||
if self._skip_stage_level_scheduling():
|
||||
ss = _get_spark_session()
|
||||
conf = ss.sparkContext.getConf()
|
||||
if self._skip_stage_level_scheduling(ss.version, conf):
|
||||
return rdd
|
||||
|
||||
ss = _get_spark_session()
|
||||
|
||||
# executor_cores will not be None
|
||||
executor_cores = ss.sparkContext.getConf().get("spark.executor.cores")
|
||||
executor_cores = conf.get("spark.executor.cores")
|
||||
assert executor_cores is not None
|
||||
|
||||
# Spark-rapids is a project to leverage GPUs to accelerate spark SQL.
|
||||
|
||||
@ -10,7 +10,7 @@ from threading import Thread
|
||||
from typing import Any, Callable, Dict, Optional, Set, Type
|
||||
|
||||
import pyspark
|
||||
from pyspark import BarrierTaskContext, SparkContext, SparkFiles, TaskContext
|
||||
from pyspark import BarrierTaskContext, SparkConf, SparkContext, SparkFiles, TaskContext
|
||||
from pyspark.sql.session import SparkSession
|
||||
|
||||
from xgboost import Booster, XGBModel, collective
|
||||
@ -129,8 +129,8 @@ def _is_local(spark_context: SparkContext) -> bool:
|
||||
return spark_context._jsc.sc().isLocal()
|
||||
|
||||
|
||||
def _is_standalone_or_localcluster(spark_context: SparkContext) -> bool:
|
||||
master = spark_context.getConf().get("spark.master")
|
||||
def _is_standalone_or_localcluster(conf: SparkConf) -> bool:
|
||||
master = conf.get("spark.master")
|
||||
return master is not None and (
|
||||
master.startswith("spark://") or master.startswith("local-cluster")
|
||||
)
|
||||
|
||||
@ -75,3 +75,28 @@ def run_ranking_qid_df(impl: ModuleType, tree_method: str) -> None:
|
||||
|
||||
with pytest.raises(ValueError, match="Either `group` or `qid`."):
|
||||
ranker.fit(df, y, eval_set=[(X, y)])
|
||||
|
||||
|
||||
def run_ranking_categorical(device: str) -> None:
|
||||
"""Test LTR with categorical features."""
|
||||
from sklearn.model_selection import cross_val_score
|
||||
|
||||
X, y = tm.make_categorical(
|
||||
n_samples=512, n_features=10, n_categories=3, onehot=False
|
||||
)
|
||||
rng = np.random.default_rng(1994)
|
||||
qid = rng.choice(3, size=y.shape[0])
|
||||
qid = np.sort(qid)
|
||||
X["qid"] = qid
|
||||
|
||||
ltr = xgb.XGBRanker(enable_categorical=True, device=device)
|
||||
ltr.fit(X, y)
|
||||
score = ltr.score(X, y)
|
||||
assert score > 0.9
|
||||
|
||||
ltr = xgb.XGBRanker(enable_categorical=True, device=device)
|
||||
|
||||
# test using the score function inside sklearn.
|
||||
scores = cross_val_score(ltr, X, y)
|
||||
for s in scores:
|
||||
assert s > 0.7
|
||||
|
||||
@ -417,9 +417,9 @@ void AllreduceBase::SetParam(const char *name, const char *val) {
|
||||
utils::Assert(!all_link.sock.BadSocket(), "ReConnectLink: bad socket");
|
||||
// set the socket to non-blocking mode, enable TCP keepalive
|
||||
CHECK(all_link.sock.NonBlocking(true).OK());
|
||||
all_link.sock.SetKeepAlive();
|
||||
CHECK(all_link.sock.SetKeepAlive().OK());
|
||||
if (rabit_enable_tcp_no_delay) {
|
||||
all_link.sock.SetNoDelay();
|
||||
CHECK(all_link.sock.SetNoDelay().OK());
|
||||
}
|
||||
if (tree_neighbors.count(all_link.rank) != 0) {
|
||||
if (all_link.rank == parent_rank) {
|
||||
|
||||
@ -21,6 +21,10 @@ if (USE_HIP)
|
||||
target_sources(objxgboost PRIVATE ${HIP_SOURCES})
|
||||
endif (USE_HIP)
|
||||
|
||||
if(PLUGIN_SYCL)
|
||||
target_compile_definitions(objxgboost PRIVATE -DXGBOOST_USE_SYCL=1)
|
||||
endif()
|
||||
|
||||
target_include_directories(objxgboost
|
||||
PRIVATE
|
||||
${xgboost_SOURCE_DIR}/include
|
||||
|
||||
@ -7,8 +7,6 @@
|
||||
#include <cinttypes> // for strtoimax
|
||||
#include <cmath> // for nan
|
||||
#include <cstring> // for strcmp
|
||||
#include <fstream> // for operator<<, basic_ostream, ios, stringstream
|
||||
#include <functional> // for less
|
||||
#include <limits> // for numeric_limits
|
||||
#include <map> // for operator!=, _Rb_tree_const_iterator, _Rb_tre...
|
||||
#include <memory> // for shared_ptr, allocator, __shared_ptr_access
|
||||
@ -22,7 +20,6 @@
|
||||
#include "../common/charconv.h" // for from_chars, to_chars, NumericLimits, from_ch...
|
||||
#include "../common/hist_util.h" // for HistogramCuts
|
||||
#include "../common/io.h" // for FileExtension, LoadSequentialFile, MemoryBuf...
|
||||
#include "../common/linalg_op.h" // for ElementWiseTransformHost
|
||||
#include "../common/threading_utils.h" // for OmpGetNumThreads, ParallelFor
|
||||
#include "../data/adapter.h" // for ArrayAdapter, DenseAdapter, RecordBatchesIte...
|
||||
#include "../data/ellpack_page.h" // for EllpackPage
|
||||
@ -35,14 +32,12 @@
|
||||
#include "dmlc/parameter.h" // for FieldAccessEntry, FieldEntry, ParamManager
|
||||
#include "dmlc/thread_local.h" // for ThreadLocalStore
|
||||
#include "rabit/c_api.h" // for RabitLinkTag
|
||||
#include "rabit/rabit.h" // for CheckPoint, LoadCheckPoint
|
||||
#include "xgboost/base.h" // for bst_ulong, bst_float, GradientPair, bst_feat...
|
||||
#include "xgboost/context.h" // for Context
|
||||
#include "xgboost/data.h" // for DMatrix, MetaInfo, DataType, ExtSparsePage
|
||||
#include "xgboost/feature_map.h" // for FeatureMap
|
||||
#include "xgboost/global_config.h" // for GlobalConfiguration, GlobalConfigThreadLocal...
|
||||
#include "xgboost/host_device_vector.h" // for HostDeviceVector
|
||||
#include "xgboost/intrusive_ptr.h" // for xgboost
|
||||
#include "xgboost/json.h" // for Json, get, Integer, IsA, Boolean, String
|
||||
#include "xgboost/learner.h" // for Learner, PredictionType
|
||||
#include "xgboost/logging.h" // for LOG_FATAL, LogMessageFatal, CHECK, LogCheck_EQ
|
||||
@ -81,6 +76,8 @@ void XGBBuildInfoDevice(Json *p_info) {
|
||||
info["USE_HIP"] = Boolean{false};
|
||||
info["USE_RCCL"] = Boolean{false};
|
||||
info["USE_RMM"] = Boolean{false};
|
||||
info["USE_DLOPEN_NCCL"] = Boolean{false};
|
||||
info["USE_DLOPEN_RCCL"] = Boolean{false};
|
||||
}
|
||||
} // namespace xgboost
|
||||
#endif
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
/**
|
||||
* Copyright 2019-2023 by XGBoost Contributors
|
||||
* Copyright 2019-2023, XGBoost Contributors
|
||||
*/
|
||||
#include <thrust/transform.h> // for transform
|
||||
|
||||
@ -15,6 +15,9 @@
|
||||
#include "xgboost/data.h"
|
||||
#include "xgboost/json.h"
|
||||
#include "xgboost/learner.h"
|
||||
#if defined(XGBOOST_USE_NCCL)
|
||||
#include <nccl.h>
|
||||
#endif
|
||||
|
||||
namespace xgboost {
|
||||
void XGBBuildInfoDevice(Json *p_info) {
|
||||
@ -38,15 +41,27 @@ void XGBBuildInfoDevice(Json *p_info) {
|
||||
info["USE_NCCL"] = Boolean{true};
|
||||
v = {Json{Integer{NCCL_MAJOR}}, Json{Integer{NCCL_MINOR}}, Json{Integer{NCCL_PATCH}}};
|
||||
info["NCCL_VERSION"] = v;
|
||||
#if defined(XGBOOST_USE_DLOPEN_NCCL)
|
||||
info["USE_DLOPEN_NCCL"] = Boolean{true};
|
||||
#else
|
||||
info["USE_DLOPEN_NCCL"] = Boolean{false};
|
||||
#endif // defined(XGBOOST_USE_DLOPEN_NCCL)
|
||||
#elif defined(XGBOOST_USE_RCCL)
|
||||
info["USE_NCCL"] = Boolean{true};
|
||||
info["USE_RCCL"] = Boolean{true};
|
||||
v = {Json{Integer{NCCL_MAJOR}}, Json{Integer{NCCL_MINOR}}, Json{Integer{NCCL_PATCH}}};
|
||||
info["RCCL_VERSION"] = v;
|
||||
info["NCCL_VERSION"] = v;
|
||||
#if defined(XGBOOST_USE_DLOPEN_RCCL)
|
||||
info["USE_DLOPEN_RCCL"] = Boolean{true};
|
||||
#else
|
||||
info["USE_DLOPEN_RCCL"] = Boolean{false};
|
||||
#endif // defined(XGBOOST_USE_DLOPEN_RCCL)
|
||||
#else
|
||||
info["USE_NCCL"] = Boolean{false};
|
||||
info["USE_DLOPEN_NCCL"] = Boolean{false};
|
||||
info["USE_RCCL"] = Boolean{false};
|
||||
info["USE_DLOPEN_RCCL"] = Boolean{false};
|
||||
#endif
|
||||
|
||||
#if defined(XGBOOST_USE_RMM)
|
||||
|
||||
119
src/c_api/coll_c_api.cc
Normal file
@ -0,0 +1,119 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#include <chrono> // for seconds
#include <cstddef> // for size_t
#include <future> // for future
#include <memory> // for unique_ptr
#include <string> // for string
#include <type_traits> // for is_same_v, remove_pointer_t
#include <utility> // for pair

#include "../collective/tracker.h" // for RabitTracker
#include "c_api_error.h" // for API_BEGIN
#include "xgboost/c_api.h"
#include "xgboost/collective/result.h" // for Result
#include "xgboost/json.h" // for Json
#include "xgboost/string_view.h" // for StringView

#if defined(XGBOOST_USE_FEDERATED)
#include "../../plugin/federated/federated_tracker.h" // for FederatedTracker
#else
#include "../common/error_msg.h" // for NoFederated
#endif

using namespace xgboost; // NOLINT

namespace {
using TrackerHandleT =
std::pair<std::unique_ptr<collective::Tracker>, std::shared_future<collective::Result>>;

TrackerHandleT *GetTrackerHandle(TrackerHandle handle) {
xgboost_CHECK_C_ARG_PTR(handle);
auto *ptr = static_cast<TrackerHandleT *>(handle);
CHECK(ptr);
return ptr;
}

struct CollAPIEntry {
std::string ret_str;
};
using CollAPIThreadLocalStore = dmlc::ThreadLocalStore<CollAPIEntry>;

void WaitImpl(TrackerHandleT *ptr) {
std::chrono::seconds wait_for{100};
auto fut = ptr->second;
while (fut.valid()) {
auto res = fut.wait_for(wait_for);
CHECK(res != std::future_status::deferred);
if (res == std::future_status::ready) {
auto const &rc = ptr->second.get();
CHECK(rc.OK()) << rc.Report();
break;
}
}
}
} // namespace

XGB_DLL int XGTrackerCreate(char const *config, TrackerHandle *handle) {
API_BEGIN();
xgboost_CHECK_C_ARG_PTR(config);

Json jconfig = Json::Load(config);

auto type = RequiredArg<String>(jconfig, "dmlc_communicator", __func__);
std::unique_ptr<collective::Tracker> tptr;
if (type == "federated") {
#if defined(XGBOOST_USE_FEDERATED)
tptr = std::make_unique<collective::FederatedTracker>(jconfig);
#else
LOG(FATAL) << error::NoFederated();
#endif // defined(XGBOOST_USE_FEDERATED)
} else if (type == "rabit") {
tptr = std::make_unique<collective::RabitTracker>(jconfig);
} else {
LOG(FATAL) << "Unknown communicator: " << type;
}

auto ptr = new TrackerHandleT{std::move(tptr), std::future<collective::Result>{}};
static_assert(std::is_same_v<std::remove_pointer_t<decltype(ptr)>, TrackerHandleT>);

xgboost_CHECK_C_ARG_PTR(handle);
*handle = ptr;
API_END();
}

XGB_DLL int XGTrackerWorkerArgs(TrackerHandle handle, char const **args) {
API_BEGIN();
auto *ptr = GetTrackerHandle(handle);
auto &local = *CollAPIThreadLocalStore::Get();
local.ret_str = Json::Dump(ptr->first->WorkerArgs());
xgboost_CHECK_C_ARG_PTR(args);
*args = local.ret_str.c_str();
API_END();
}

XGB_DLL int XGTrackerRun(TrackerHandle handle) {
API_BEGIN();
auto *ptr = GetTrackerHandle(handle);
CHECK(!ptr->second.valid()) << "Tracker is already running.";
ptr->second = ptr->first->Run();
API_END();
}

XGB_DLL int XGTrackerWait(TrackerHandle handle, char const *config) {
API_BEGIN();
auto *ptr = GetTrackerHandle(handle);
xgboost_CHECK_C_ARG_PTR(config);
auto jconfig = Json::Load(StringView{config});
WaitImpl(ptr);
API_END();
}

XGB_DLL int XGTrackerFree(TrackerHandle handle) {
API_BEGIN();
auto *ptr = GetTrackerHandle(handle);
WaitImpl(ptr);
delete ptr;
API_END();
}
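Both XGTrackerWait and XGTrackerFree funnel into WaitImpl, which polls a shared_future in fixed time slices rather than blocking indefinitely. A self-contained sketch of that polling loop, with std::async standing in for Tracker::Run (everything here is hypothetical, not the C API above):

// Hedged sketch of the future-polling pattern in WaitImpl.
#include <chrono>
#include <future>
#include <iostream>
#include <thread>

int main() {
  std::shared_future<int> fut = std::async(std::launch::async, [] {
    std::this_thread::sleep_for(std::chrono::milliseconds(250));
    return 0;  // stands in for the tracker's exit result
  }).share();

  while (fut.valid()) {
    auto res = fut.wait_for(std::chrono::milliseconds(100));
    if (res == std::future_status::ready) {
      std::cout << "tracker finished with " << fut.get() << "\n";
      break;  // a shared_future stays valid after get(), so exit explicitly
    }
    // timed out: loop and wait again, as WaitImpl does in 100-second slices
  }
}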
@ -26,18 +26,19 @@ Result RingAllgather(Comm const& comm, common::Span<std::int8_t> data, std::size
}

for (std::int32_t r = 0; r < world; ++r) {
auto send_rank = (rank + world - r + worker_off) % world;
auto send_off = send_rank * segment_size;
send_off = std::min(send_off, data.size_bytes());
auto send_seg = data.subspan(send_off, std::min(segment_size, data.size_bytes() - send_off));
next_ch->SendAll(send_seg.data(), send_seg.size_bytes());

auto recv_rank = (rank + world - r - 1 + worker_off) % world;
auto recv_off = recv_rank * segment_size;
recv_off = std::min(recv_off, data.size_bytes());
auto recv_seg = data.subspan(recv_off, std::min(segment_size, data.size_bytes() - recv_off));
prev_ch->RecvAll(recv_seg.data(), recv_seg.size_bytes());
auto rc = prev_ch->Block();
auto rc = Success() << [&] {
auto send_rank = (rank + world - r + worker_off) % world;
auto send_off = send_rank * segment_size;
send_off = std::min(send_off, data.size_bytes());
auto send_seg = data.subspan(send_off, std::min(segment_size, data.size_bytes() - send_off));
return next_ch->SendAll(send_seg.data(), send_seg.size_bytes());
} << [&] {
auto recv_rank = (rank + world - r - 1 + worker_off) % world;
auto recv_off = recv_rank * segment_size;
recv_off = std::min(recv_off, data.size_bytes());
auto recv_seg = data.subspan(recv_off, std::min(segment_size, data.size_bytes() - recv_off));
return prev_ch->RecvAll(recv_seg.data(), recv_seg.size_bytes());
} << [&] { return prev_ch->Block(); };
if (!rc.OK()) {
return rc;
}
@ -78,19 +79,19 @@ namespace detail {
auto next_ch = comm.Chan(next);

for (std::int32_t r = 0; r < world; ++r) {
auto send_rank = (rank + world - r) % world;
auto send_off = offset[send_rank];
auto send_size = sizes[send_rank];
auto send_seg = erased_result.subspan(send_off, send_size);
next_ch->SendAll(send_seg);

auto recv_rank = (rank + world - r - 1) % world;
auto recv_off = offset[recv_rank];
auto recv_size = sizes[recv_rank];
auto recv_seg = erased_result.subspan(recv_off, recv_size);
prev_ch->RecvAll(recv_seg.data(), recv_seg.size_bytes());

auto rc = prev_ch->Block();
auto rc = Success() << [&] {
auto send_rank = (rank + world - r) % world;
auto send_off = offset[send_rank];
auto send_size = sizes[send_rank];
auto send_seg = erased_result.subspan(send_off, send_size);
return next_ch->SendAll(send_seg);
} << [&] {
auto recv_rank = (rank + world - r - 1) % world;
auto recv_off = offset[recv_rank];
auto recv_size = sizes[recv_rank];
auto recv_seg = erased_result.subspan(recv_off, recv_size);
return prev_ch->RecvAll(recv_seg.data(), recv_seg.size_bytes());
} << [&] { return prev_ch->Block(); };
if (!rc.OK()) {
return rc;
}

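The hunks above replace fire-and-forget SendAll/RecvAll calls with lambdas chained through operator<<, so each step runs only if the previous one succeeded. A self-contained sketch of such a short-circuiting combinator (a hypothetical Result type, not the one declared in xgboost/collective/result.h):

// Hedged sketch of the Success() << [&]{...} << [&]{...} chaining style.
#include <iostream>
#include <string>
#include <utility>

struct Result {
  bool ok{true};
  std::string msg;
};
Result Success() { return {}; }
Result Fail(std::string m) { return {false, std::move(m)}; }

// Run fn only while the accumulated result is still OK.
template <typename Fn>
Result operator<<(Result&& r, Fn&& fn) {
  if (!r.ok) {
    return std::move(r);  // short-circuit: later steps are skipped
  }
  return fn();
}

int main() {
  auto rc = Success() << [] { return Success(); }            // runs
                      << [] { return Fail("send failed"); }  // runs, fails
                      << [] { return Success(); };           // skipped
  std::cout << (rc.ok ? "ok" : rc.msg) << "\n";  // prints "send failed"
}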
@ -6,6 +6,7 @@
#include <algorithm> // for min
#include <cstddef> // for size_t
#include <cstdint> // for int32_t, int8_t
#include <utility> // for move
#include <vector> // for vector

#include "../data/array_interface.h" // for Type, DispatchDType
@ -36,7 +37,10 @@ Result RingScatterReduceTyped(Comm const& comm, common::Span<std::int8_t> data,
auto seg_nbytes = std::min(data.size_bytes() - send_off, n_bytes_in_seg);
auto send_seg = data.subspan(send_off, seg_nbytes);

next_ch->SendAll(send_seg);
auto rc = next_ch->SendAll(send_seg);
if (!rc.OK()) {
return rc;
}

// receive from ring prev
auto recv_off = ((rank + world - r - 1) % world) * n_bytes_in_seg;
@ -46,8 +50,7 @@ Result RingScatterReduceTyped(Comm const& comm, common::Span<std::int8_t> data,
auto recv_seg = data.subspan(recv_off, seg_nbytes);
auto seg = s_buf.subspan(0, recv_seg.size());

prev_ch->RecvAll(seg);
auto rc = prev_ch->Block();
rc = std::move(rc) << [&] { return prev_ch->RecvAll(seg); } << [&] { return comm.Block(); };
if (!rc.OK()) {
return rc;
}
@ -62,6 +65,9 @@ Result RingScatterReduceTyped(Comm const& comm, common::Span<std::int8_t> data,

Result RingAllreduce(Comm const& comm, common::Span<std::int8_t> data, Func const& op,
ArrayInterfaceHandler::Type type) {
if (comm.World() == 1) {
return Success();
}
return DispatchDType(type, [&](auto t) {
using T = decltype(t);
// Divide the data into segments according to the number of workers.
@ -80,11 +86,9 @@ Result RingAllreduce(Comm const& comm, common::Span<std::int8_t> data, Func cons
auto prev_ch = comm.Chan(prev);
auto next_ch = comm.Chan(next);

rc = RingAllgather(comm, data, n_bytes_in_seg, 1, prev_ch, next_ch);
if (!rc.OK()) {
return rc;
}
return comm.Block();
return std::move(rc) << [&] {
return RingAllgather(comm, data, n_bytes_in_seg, 1, prev_ch, next_ch);
} << [&] { return comm.Block(); };
});
}
} // namespace xgboost::collective::cpu_impl

@ -62,8 +62,8 @@ Result Broadcast(Comm const& comm, common::Span<std::int8_t> data, std::int32_t

if (shifted_rank != 0) { // not root
auto parent = ShiftRight(ShiftedParentRank(shifted_rank, depth), world, root);
comm.Chan(parent)->RecvAll(data);
auto rc = comm.Chan(parent)->Block();
auto rc = Success() << [&] { return comm.Chan(parent)->RecvAll(data); }
<< [&] { return comm.Chan(parent)->Block(); };
if (!rc.OK()) {
return Fail("broadcast failed.", std::move(rc));
}
@ -75,7 +75,10 @@ Result Broadcast(Comm const& comm, common::Span<std::int8_t> data, std::int32_t
auto sft_peer = shifted_rank + (1 << i);
auto peer = ShiftRight(sft_peer, world, root);
CHECK_NE(peer, root);
comm.Chan(peer)->SendAll(data);
auto rc = comm.Chan(peer)->SendAll(data);
if (!rc.OK()) {
return rc;
}
}
}


@ -23,25 +23,6 @@ Coll* Coll::MakeCUDAVar() { return new NCCLColl{}; }

NCCLColl::~NCCLColl() = default;
namespace {
Result GetNCCLResult(ncclResult_t code) {
if (code == ncclSuccess) {
return Success();
}

std::stringstream ss;
ss << "NCCL failure: " << ncclGetErrorString(code) << ".";
if (code == ncclUnhandledCudaError) {
// nccl usually preserves the last error so we can get more details.
auto err = cudaPeekAtLastError();
ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
} else if (code == ncclSystemError) {
ss << " This might be caused by a network configuration issue. Please consider specifying "
"the network interface for NCCL via environment variables listed in its reference: "
"`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
}
return Fail(ss.str());
}

auto GetNCCLType(ArrayInterfaceHandler::Type type) {
auto fatal = [] {
LOG(FATAL) << "Invalid type for NCCL operation.";
@ -98,11 +79,12 @@ void RunBitwiseAllreduce(dh::CUDAStreamView stream, common::Span<std::int8_t> ou
common::Span<std::int8_t> data, Op op) {
dh::device_vector<std::int8_t> buffer(data.size() * pcomm->World());
auto* device_buffer = buffer.data().get();
auto stub = pcomm->Stub();

// First gather data from all the workers.
CHECK(handle);
auto rc = GetNCCLResult(
ncclAllGather(data.data(), device_buffer, data.size(), ncclInt8, handle, pcomm->Stream()));
auto rc =
stub->Allgather(data.data(), device_buffer, data.size(), ncclInt8, handle, pcomm->Stream());
if (!rc.OK()) {
return rc;
}
@ -153,6 +135,8 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) {
}
auto nccl = dynamic_cast<NCCLComm const*>(&comm);
CHECK(nccl);
auto stub = nccl->Stub();

return Success() << [&] {
if (IsBitwiseOp(op)) {
return BitwiseAllReduce(nccl, nccl->Handle(), data, op);
@ -160,9 +144,8 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) {
return DispatchDType(type, [=](auto t) {
using T = decltype(t);
auto rdata = common::RestoreType<T>(data);
auto rc = ncclAllReduce(data.data(), data.data(), rdata.size(), GetNCCLType(type),
GetNCCLRedOp(op), nccl->Handle(), nccl->Stream());
return GetNCCLResult(rc);
return stub->Allreduce(data.data(), data.data(), rdata.size(), GetNCCLType(type),
GetNCCLRedOp(op), nccl->Handle(), nccl->Stream());
});
}
} << [&] { return nccl->Block(); };
@ -175,9 +158,11 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) {
}
auto nccl = dynamic_cast<NCCLComm const*>(&comm);
CHECK(nccl);
auto stub = nccl->Stub();

return Success() << [&] {
return GetNCCLResult(ncclBroadcast(data.data(), data.data(), data.size_bytes(), ncclInt8, root,
nccl->Handle(), nccl->Stream()));
return stub->Broadcast(data.data(), data.data(), data.size_bytes(), ncclInt8, root,
nccl->Handle(), nccl->Stream());
} << [&] { return nccl->Block(); };
}

@ -188,10 +173,12 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) {
}
auto nccl = dynamic_cast<NCCLComm const*>(&comm);
CHECK(nccl);
auto stub = nccl->Stub();

auto send = data.subspan(comm.Rank() * size, size);
return Success() << [&] {
return GetNCCLResult(
ncclAllGather(send.data(), data.data(), size, ncclInt8, nccl->Handle(), nccl->Stream()));
return stub->Allgather(send.data(), data.data(), size, ncclInt8, nccl->Handle(),
nccl->Stream());
} << [&] { return nccl->Block(); };
}

@ -203,19 +190,20 @@ namespace cuda_impl {
*/
Result BroadcastAllgatherV(NCCLComm const* comm, common::Span<std::int8_t const> data,
common::Span<std::int64_t const> sizes, common::Span<std::int8_t> recv) {
return Success() << [] { return GetNCCLResult(ncclGroupStart()); } << [&] {
auto stub = comm->Stub();
return Success() << [&stub] { return stub->GroupStart(); } << [&] {
std::size_t offset = 0;
for (std::int32_t r = 0; r < comm->World(); ++r) {
auto as_bytes = sizes[r];
auto rc = ncclBroadcast(data.data(), recv.subspan(offset, as_bytes).data(), as_bytes,
ncclInt8, r, comm->Handle(), dh::DefaultStream());
if (rc != ncclSuccess) {
return GetNCCLResult(rc);
auto rc = stub->Broadcast(data.data(), recv.subspan(offset, as_bytes).data(), as_bytes,
ncclInt8, r, comm->Handle(), dh::DefaultStream());
if (!rc.OK()) {
return rc;
}
offset += as_bytes;
}
return Success();
} << [] { return GetNCCLResult(ncclGroupEnd()); };
} << [&] { return stub->GroupEnd(); };
}
} // namespace cuda_impl

@ -228,10 +216,11 @@ Result BroadcastAllgatherV(NCCLComm const* comm, common::Span<std::int8_t const>
if (!comm.IsDistributed()) {
return Success();
}
auto stub = nccl->Stub();

switch (algo) {
case AllgatherVAlgo::kRing: {
return Success() << [] { return GetNCCLResult(ncclGroupStart()); } << [&] {
return Success() << [&] { return stub->GroupStart(); } << [&] {
// get worker offset
detail::AllgatherVOffset(sizes, recv_segments);
// copy data
@ -241,8 +230,8 @@ Result BroadcastAllgatherV(NCCLComm const* comm, common::Span<std::int8_t const>
cudaMemcpyDeviceToDevice, nccl->Stream()));
}
return detail::RingAllgatherV(comm, sizes, recv_segments, recv);
} << [] {
return GetNCCLResult(ncclGroupEnd());
} << [&] {
return stub->GroupEnd();
} << [&] { return nccl->Block(); };
}
case AllgatherVAlgo::kBcast: {

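The stub object threaded through these hunks pairs with the new USE_DLOPEN_NCCL build info: NCCL symbols can be resolved at runtime instead of being linked at build time. A minimal sketch of that dlopen/dlsym idea (POSIX-only; the alias and error handling are assumptions for illustration, not the nccl_stub.h API, and ncclGetVersion's ncclResult_t return is narrowed to int here):

// Hedged sketch: resolve one NCCL symbol at runtime, as a dlopen stub would.
#include <dlfcn.h>
#include <stdexcept>
#include <string>

using NcclGetVersionFn = int (*)(int*);  // ncclResult_t treated as int

int LoadNcclVersion(std::string const& path) {
  void* handle = dlopen(path.c_str(), RTLD_LAZY | RTLD_LOCAL);
  if (handle == nullptr) {
    throw std::runtime_error(dlerror());  // library missing or unloadable
  }
  auto fn = reinterpret_cast<NcclGetVersionFn>(dlsym(handle, "ncclGetVersion"));
  if (fn == nullptr) {
    dlclose(handle);
    throw std::runtime_error("ncclGetVersion not found");
  }
  int version = 0;
  fn(&version);  // e.g. 21903 for NCCL 2.19.3
  dlclose(handle);
  return version;
}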
@ -8,7 +8,8 @@
#include "../data/array_interface.h" // for ArrayInterfaceHandler
#include "coll.h" // for Coll
#include "comm.h" // for Comm
#include "xgboost/span.h" // for Span
#include "nccl_stub.h"
#include "xgboost/span.h" // for Span

namespace xgboost::collective {
class NCCLColl : public Coll {

@ -5,6 +5,7 @@

#include <algorithm> // for copy
#include <chrono> // for seconds
#include <cstdlib> // for exit
#include <memory> // for shared_ptr
#include <string> // for string
#include <utility> // for move, forward
@ -29,19 +30,28 @@ Comm::Comm(std::string const& host, std::int32_t port, std::chrono::seconds time
Result ConnectTrackerImpl(proto::PeerInfo info, std::chrono::seconds timeout, std::int32_t retry,
std::string const& task_id, TCPSocket* out, std::int32_t rank,
std::int32_t world) {
// get information from tracker
// Get information from the tracker
CHECK(!info.host.empty());
auto rc = Connect(info.host, info.port, retry, timeout, out);
if (!rc.OK()) {
return Fail("Failed to connect to the tracker.", std::move(rc));
}

TCPSocket& tracker = *out;
return std::move(rc)
<< [&] { return tracker.NonBlocking(false); }
<< [&] { return tracker.RecvTimeout(timeout); }
<< [&] { return proto::Magic{}.Verify(&tracker); }
<< [&] { return proto::Connect{}.WorkerSend(&tracker, world, rank, task_id); };
return Success() << [&] {
auto rc = Connect(info.host, info.port, retry, timeout, out);
if (rc.OK()) {
return rc;
} else {
return Fail("Failed to connect to the tracker.", std::move(rc));
}
} << [&] {
return tracker.NonBlocking(false);
} << [&] {
return tracker.RecvTimeout(timeout);
} << [&] {
return proto::Magic{}.Verify(&tracker);
} << [&] {
return proto::Connect{}.WorkerSend(&tracker, world, rank, task_id);
} << [&] {
LOG(INFO) << "Task " << task_id << " connected to the tracker";
return Success();
};
}

[[nodiscard]] Result Comm::ConnectTracker(TCPSocket* out) const {
@ -49,14 +59,6 @@ Result ConnectTrackerImpl(proto::PeerInfo info, std::chrono::seconds timeout, st
this->Rank(), this->World());
}

#if !defined(XGBOOST_USE_NCCL) && !defined(XGBOOST_USE_RCCL)
Comm* Comm::MakeCUDAVar(Context const*, std::shared_ptr<Coll>) const {
common::AssertGPUSupport();
common::AssertNCCLSupport();
return nullptr;
}
#endif // !defined(XGBOOST_USE_NCCL)

[[nodiscard]] Result ConnectWorkers(Comm const& comm, TCPSocket* listener, std::int32_t lport,
proto::PeerInfo ninfo, std::chrono::seconds timeout,
std::int32_t retry,
@ -181,12 +183,21 @@ Comm* Comm::MakeCUDAVar(Context const*, std::shared_ptr<Coll>) const {
}

RabitComm::RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout,
std::int32_t retry, std::string task_id)
: Comm{std::move(host), port, timeout, retry, std::move(task_id)} {
std::int32_t retry, std::string task_id, StringView nccl_path)
: HostComm{std::move(host), port, timeout, retry, std::move(task_id)},
nccl_path_{std::move(nccl_path)} {
auto rc = this->Bootstrap(timeout_, retry_, task_id_);
CHECK(rc.OK()) << rc.Report();
}

#if !defined(XGBOOST_USE_NCCL) && !defined(XGBOOST_USE_RCCL)
Comm* RabitComm::MakeCUDAVar(Context const*, std::shared_ptr<Coll>) const {
common::AssertGPUSupport();
common::AssertNCCLSupport();
return nullptr;
}
#endif // !defined(XGBOOST_USE_NCCL)

[[nodiscard]] Result RabitComm::Bootstrap(std::chrono::seconds timeout, std::int32_t retry,
std::string task_id) {
TCPSocket tracker;
@ -209,24 +220,18 @@ RabitComm::RabitComm(std::string const& host, std::int32_t port, std::chrono::se
std::shared_ptr<TCPSocket> error_sock{TCPSocket::CreatePtr(domain)};
auto eport = error_sock->BindHost();
error_sock->Listen();
error_worker_ = std::thread{[this, error_sock = std::move(error_sock)] {
error_worker_ = std::thread{[error_sock = std::move(error_sock)] {
auto conn = error_sock->Accept();
|
||||
// On Windows accept returns an invalid socket after network is shutdown.
|
||||
// On Windows, accept returns a closed socket after finalize.
|
||||
if (conn.IsClosed()) {
|
||||
return;
|
||||
}
|
||||
LOG(WARNING) << "Another worker is running into error.";
|
||||
std::string scmd;
|
||||
conn.Recv(&scmd);
|
||||
auto jcmd = Json::Load(scmd);
|
||||
auto rc = this->Shutdown();
|
||||
if (!rc.OK()) {
|
||||
LOG(WARNING) << "Fail to shutdown worker:" << rc.Report();
|
||||
}
|
||||
#if !defined(XGBOOST_STRICT_R_MODE) || XGBOOST_STRICT_R_MODE == 0
|
||||
exit(-1);
|
||||
// exit is nicer than abort as the former performs cleanups.
|
||||
std::exit(-1);
|
||||
#else
|
||||
LOG(FATAL) << rc.Report();
|
||||
LOG(FATAL) << "abort";
|
||||
#endif
|
||||
}};
|
||||
error_worker_.detach();
|
||||
@ -259,8 +264,8 @@ RabitComm::RabitComm(std::string const& host, std::int32_t port, std::chrono::se
|
||||
CHECK(this->channels_.empty());
|
||||
for (auto& w : workers) {
|
||||
if (w) {
|
||||
w->SetNoDelay();
|
||||
rc = w->NonBlocking(true);
|
||||
rc = std::move(rc) << [&] { return w->SetNoDelay(); } << [&] { return w->NonBlocking(true); }
|
||||
<< [&] { return w->SetKeepAlive(); };
|
||||
}
|
||||
if (!rc.OK()) {
|
||||
return rc;
|
||||
|
||||
@ -10,21 +10,24 @@
|
||||
#include <sstream> // for stringstream
|
||||
#include <vector> // for vector
|
||||
|
||||
#include "../common/cuda_context.cuh" // for CUDAContext
|
||||
#include "../common/device_helpers.cuh" // for DefaultStream
|
||||
#include "../common/type.h" // for EraseType
|
||||
#include "broadcast.h" // for Broadcast
|
||||
#include "comm.cuh" // for NCCLComm
|
||||
#include "comm.h" // for Comm
|
||||
#include "nccl_stub.h" // for NcclStub
|
||||
#include "xgboost/collective/result.h" // for Result
|
||||
#include "xgboost/span.h" // for Span
|
||||
|
||||
namespace xgboost::collective {
|
||||
namespace {
|
||||
Result GetUniqueId(Comm const& comm, std::shared_ptr<Coll> coll, ncclUniqueId* pid) {
|
||||
Result GetUniqueId(Comm const& comm, std::shared_ptr<NcclStub> stub, std::shared_ptr<Coll> coll,
|
||||
ncclUniqueId* pid) {
|
||||
static const int kRootRank = 0;
|
||||
ncclUniqueId id;
|
||||
if (comm.Rank() == kRootRank) {
|
||||
dh::safe_nccl(ncclGetUniqueId(&id));
|
||||
auto rc = stub->GetUniqueId(&id);
|
||||
CHECK(rc.OK()) << rc.Report();
|
||||
}
|
||||
auto rc = coll->Broadcast(
|
||||
comm, common::Span{reinterpret_cast<std::int8_t*>(&id), sizeof(ncclUniqueId)}, kRootRank);
|
||||
@ -63,14 +66,15 @@ static std::string PrintUUID(xgboost::common::Span<std::uint64_t, kUuidLength> c
|
||||
}
|
||||
} // namespace
|
||||
|
||||
Comm* Comm::MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const {
|
||||
return new NCCLComm{ctx, *this, pimpl};
|
||||
Comm* RabitComm::MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const {
|
||||
return new NCCLComm{ctx, *this, pimpl, StringView{this->nccl_path_}};
|
||||
}
|
||||
|
||||
NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> pimpl)
|
||||
NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> pimpl,
|
||||
StringView nccl_path)
|
||||
: Comm{root.TrackerInfo().host, root.TrackerInfo().port, root.Timeout(), root.Retry(),
|
||||
root.TaskID()},
|
||||
stream_{dh::DefaultStream()} {
|
||||
stream_{ctx->CUDACtx()->Stream()} {
|
||||
this->world_ = root.World();
|
||||
this->rank_ = root.Rank();
|
||||
this->domain_ = root.Domain();
|
||||
@ -79,6 +83,7 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> p
|
||||
}
|
||||
|
||||
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
|
||||
stub_ = std::make_shared<NcclStub>(nccl_path);
|
||||
|
||||
std::vector<std::uint64_t> uuids(root.World() * kUuidLength, 0);
|
||||
auto s_uuid = xgboost::common::Span<std::uint64_t>{uuids.data(), uuids.size()};
|
||||
@ -104,19 +109,22 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> p
|
||||
<< "Multiple processes within communication group running on same CUDA "
|
||||
<< "device is not supported. " << PrintUUID(s_this_uuid) << "\n";
|
||||
|
||||
rc = GetUniqueId(root, pimpl, &nccl_unique_id_);
|
||||
rc = std::move(rc) << [&] { return GetUniqueId(root, this->stub_, pimpl, &nccl_unique_id_); } <<
|
||||
[&] {
|
||||
return this->stub_->CommInitRank(&nccl_comm_, root.World(), nccl_unique_id_, root.Rank());
|
||||
};
|
||||
CHECK(rc.OK()) << rc.Report();
|
||||
dh::safe_nccl(ncclCommInitRank(&nccl_comm_, root.World(), nccl_unique_id_, root.Rank()));
|
||||
|
||||
for (std::int32_t r = 0; r < root.World(); ++r) {
|
||||
this->channels_.emplace_back(
|
||||
std::make_shared<NCCLChannel>(root, r, nccl_comm_, dh::DefaultStream()));
|
||||
std::make_shared<NCCLChannel>(root, r, nccl_comm_, stub_, dh::DefaultStream()));
|
||||
}
|
||||
}
|
||||
|
||||
NCCLComm::~NCCLComm() {
|
||||
if (nccl_comm_) {
|
||||
dh::safe_nccl(ncclCommDestroy(nccl_comm_));
|
||||
auto rc = stub_->CommDestroy(nccl_comm_);
|
||||
CHECK(rc.OK()) << rc.Report();
|
||||
}
|
||||
}
|
||||
} // namespace xgboost::collective
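`GetUniqueId` plus `CommInitRank` above is the standard NCCL bootstrap: rank 0 creates the unique id, the id travels to every rank over the already-established CPU channel, and only then does each rank join the NCCL communicator. A bare sketch of that sequence against the NCCL C API; `cpu_broadcast` is a hypothetical stand-in for the `Coll::Broadcast` call used in the diff, and NCCL return codes are unchecked for brevity:

#include <cuda_runtime_api.h>
#include <nccl.h>

// Sketch of the bootstrap handshake used by NCCLComm. Each rank runs this with
// its own rank index; the id must travel over the CPU communicator first.
ncclComm_t InitNccl(int rank, int world, void (*cpu_broadcast)(void*, size_t, int)) {
  ncclUniqueId id;
  if (rank == 0) {
    ncclGetUniqueId(&id);  // only the root creates the id
  }
  // Every rank must receive the same id before initialization.
  cpu_broadcast(&id, sizeof(id), /*root=*/0);
  ncclComm_t comm;
  ncclCommInitRank(&comm, world, id, rank);  // collective: all ranks must call
  return comm;
}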

@ -9,9 +9,13 @@
#include "../common/cuda_to_hip.h"
#include "rccl.h"
#endif // XGBOOST_USE_NCCL

#include <utility> // for move

#include "../common/device_helpers.cuh"
#include "coll.h"
#include "comm.h"
#include "nccl_stub.h" // for NcclStub
#include "xgboost/context.h"

namespace xgboost::collective {
@ -24,15 +28,20 @@ inline Result GetCUDAResult(cudaError rc) {
return Fail(msg);
}

#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
class NCCLComm : public Comm {
ncclComm_t nccl_comm_{nullptr};
std::shared_ptr<NcclStub> stub_;
ncclUniqueId nccl_unique_id_{};
dh::CUDAStreamView stream_;
std::string nccl_path_;

public:
[[nodiscard]] ncclComm_t Handle() const { return nccl_comm_; }
auto Stub() const { return stub_; }

explicit NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> pimpl);
explicit NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr<Coll> pimpl,
StringView nccl_path);
[[nodiscard]] Result LogTracker(std::string) const override {
LOG(FATAL) << "Device comm is used for logging.";
return Fail("Undefined.");
@ -49,22 +58,29 @@ class NCCLComm : public Comm {
class NCCLChannel : public Channel {
std::int32_t rank_{-1};
ncclComm_t nccl_comm_{};
std::shared_ptr<NcclStub> stub_;
dh::CUDAStreamView stream_;

public:
explicit NCCLChannel(Comm const& comm, std::int32_t rank, ncclComm_t nccl_comm,
dh::CUDAStreamView stream)
: rank_{rank}, nccl_comm_{nccl_comm}, Channel{comm, nullptr}, stream_{stream} {}
std::shared_ptr<NcclStub> stub, dh::CUDAStreamView stream)
: rank_{rank},
nccl_comm_{nccl_comm},
stub_{std::move(stub)},
Channel{comm, nullptr},
stream_{stream} {}

void SendAll(std::int8_t const* ptr, std::size_t n) override {
dh::safe_nccl(ncclSend(ptr, n, ncclInt8, rank_, nccl_comm_, stream_));
[[nodiscard]] Result SendAll(std::int8_t const* ptr, std::size_t n) override {
return stub_->Send(ptr, n, ncclInt8, rank_, nccl_comm_, stream_);
}
void RecvAll(std::int8_t* ptr, std::size_t n) override {
dh::safe_nccl(ncclRecv(ptr, n, ncclInt8, rank_, nccl_comm_, stream_));
[[nodiscard]] Result RecvAll(std::int8_t* ptr, std::size_t n) override {
return stub_->Recv(ptr, n, ncclInt8, rank_, nccl_comm_, stream_);
}
[[nodiscard]] Result Block() override {
auto rc = stream_.Sync(false);
return GetCUDAResult(rc);
}
};

#endif // defined(XGBOOST_USE_NCCL)
} // namespace xgboost::collective

@ -34,6 +34,8 @@ inline std::int32_t BootstrapPrev(std::int32_t r, std::int32_t world) {
return nrank;
}

inline StringView DefaultNcclName() { return "libnccl.so.2"; }

class Channel;
class Coll;

@ -86,11 +88,21 @@ class Comm : public std::enable_shared_from_this<Comm> {
[[nodiscard]] virtual Result LogTracker(std::string msg) const = 0;

[[nodiscard]] virtual Result SignalError(Result const&) { return Success(); }

virtual Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const;
};

class RabitComm : public Comm {
/**
* @brief Base class for CPU-based communicator.
*/
class HostComm : public Comm {
public:
using Comm::Comm;
[[nodiscard]] virtual Comm* MakeCUDAVar(Context const* ctx,
std::shared_ptr<Coll> pimpl) const = 0;
};

class RabitComm : public HostComm {
std::string nccl_path_ = std::string{DefaultNcclName()};

[[nodiscard]] Result Bootstrap(std::chrono::seconds timeout, std::int32_t retry,
std::string task_id);
[[nodiscard]] Result Shutdown();
@ -100,13 +112,15 @@ class RabitComm : public Comm {
RabitComm() = default;
// ctor for testing where environment is known.
RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout,
std::int32_t retry, std::string task_id);
std::int32_t retry, std::string task_id, StringView nccl_path);
~RabitComm() noexcept(false) override;

[[nodiscard]] bool IsFederated() const override { return false; }
[[nodiscard]] Result LogTracker(std::string msg) const override;

[[nodiscard]] Result SignalError(Result const&) override;

[[nodiscard]] Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr<Coll> pimpl) const override;
};

/**
@ -121,21 +135,25 @@ class Channel {
explicit Channel(Comm const& comm, std::shared_ptr<TCPSocket> sock)
: sock_{std::move(sock)}, comm_{comm} {}

virtual void SendAll(std::int8_t const* ptr, std::size_t n) {
[[nodiscard]] virtual Result SendAll(std::int8_t const* ptr, std::size_t n) {
Loop::Op op{Loop::Op::kWrite, comm_.Rank(), const_cast<std::int8_t*>(ptr), n, sock_.get(), 0};
CHECK(sock_.get());
comm_.Submit(std::move(op));
return Success();
}
void SendAll(common::Span<std::int8_t const> data) {
this->SendAll(data.data(), data.size_bytes());
[[nodiscard]] Result SendAll(common::Span<std::int8_t const> data) {
return this->SendAll(data.data(), data.size_bytes());
}

virtual void RecvAll(std::int8_t* ptr, std::size_t n) {
[[nodiscard]] virtual Result RecvAll(std::int8_t* ptr, std::size_t n) {
Loop::Op op{Loop::Op::kRead, comm_.Rank(), ptr, n, sock_.get(), 0};
CHECK(sock_.get());
comm_.Submit(std::move(op));
return Success();
}
[[nodiscard]] Result RecvAll(common::Span<std::int8_t> data) {
return this->RecvAll(data.data(), data.size_bytes());
}
void RecvAll(common::Span<std::int8_t> data) { this->RecvAll(data.data(), data.size_bytes()); }

[[nodiscard]] auto Socket() const { return sock_; }
[[nodiscard]] virtual Result Block() { return comm_.Block(); }
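The new `HostComm` base turns `MakeCUDAVar` into a factory-style double dispatch: callers hold a CPU communicator and ask it to manufacture its own GPU counterpart, so the caller never needs to know which concrete backend it owns. A compact sketch of the shape, with toy types standing in for the real classes:

#include <memory>

// Sketch of the MakeCUDAVar factory pattern from comm.h, using toy types.
struct Coll {};
struct Comm {
  virtual ~Comm() = default;
};
// CPU-side base: every host communicator knows how to build its GPU twin.
struct HostComm : public Comm {
  virtual Comm* MakeCUDAVar(std::shared_ptr<Coll> backend) const = 0;
};
struct NCCLComm : public Comm {};  // GPU variant
struct RabitComm : public HostComm {
  Comm* MakeCUDAVar(std::shared_ptr<Coll>) const override { return new NCCLComm{}; }
};

int main() {
  std::shared_ptr<HostComm> host = std::make_shared<RabitComm>();
  // The GPU communicator is created lazily, exactly once, from the host one.
  std::unique_ptr<Comm> gpu{host->MakeCUDAVar(std::make_shared<Coll>())};
}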

122
src/collective/comm_group.cc
Normal file
@ -0,0 +1,122 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#include "comm_group.h"

#include <algorithm> // for transform
#include <chrono> // for seconds
#include <cstdint> // for int32_t
#include <memory> // for shared_ptr, unique_ptr
#include <string> // for string
#include <vector> // for vector

#include "../common/json_utils.h" // for OptionalArg
#include "coll.h" // for Coll
#include "comm.h" // for Comm
#include "tracker.h" // for GetHostAddress
#include "xgboost/collective/result.h" // for Result
#include "xgboost/context.h" // for DeviceOrd
#include "xgboost/json.h" // for Json

#if defined(XGBOOST_USE_FEDERATED)
#include "../../plugin/federated/federated_coll.h"
#include "../../plugin/federated/federated_comm.h"
#endif

namespace xgboost::collective {
[[nodiscard]] std::shared_ptr<Coll> CommGroup::Backend(DeviceOrd device) const {
if (device.IsCUDA()) {
if (!gpu_coll_) {
gpu_coll_.reset(backend_->MakeCUDAVar());
}
return gpu_coll_;
}
return backend_;
}

[[nodiscard]] Comm const& CommGroup::Ctx(Context const* ctx, DeviceOrd device) const {
if (device.IsCUDA()) {
CHECK(ctx->IsCUDA());
if (!gpu_comm_ || gpu_comm_->World() != comm_->World()) {
gpu_comm_.reset(comm_->MakeCUDAVar(ctx, backend_));
}
return *gpu_comm_;
}
return *comm_;
}

CommGroup::CommGroup()
: comm_{std::shared_ptr<RabitComm>(new RabitComm{})},  // NOLINT
backend_{std::shared_ptr<Coll>(new Coll{})} {}  // NOLINT

[[nodiscard]] CommGroup* CommGroup::Create(Json config) {
if (IsA<Null>(config)) {
return new CommGroup;
}

std::string type = OptionalArg<String>(config, "dmlc_communicator", std::string{"rabit"});
// Try both lower and upper case for compatibility
auto get_param = [&](std::string name, auto dft, auto t) {
std::string upper;
std::transform(name.cbegin(), name.cend(), std::back_inserter(upper),
[](char c) { return std::toupper(c); });
std::transform(name.cbegin(), name.cend(), name.begin(),
[](char c) { return std::tolower(c); });

auto const& obj = get<Object const>(config);
auto it = obj.find(upper);
if (it != obj.cend()) {
return OptionalArg<decltype(t)>(config, upper, dft);
} else {
return OptionalArg<decltype(t)>(config, name, dft);
}
};
// Common args
auto retry = get_param("dmlc_retry", static_cast<Integer::Int>(DefaultRetry()), Integer{});
auto timeout =
get_param("dmlc_timeout_sec", static_cast<Integer::Int>(DefaultTimeoutSec()), Integer{});
auto task_id = get_param("dmlc_task_id", std::string{}, String{});

if (type == "rabit") {
auto host = get_param("dmlc_tracker_uri", std::string{}, String{});
auto port = get_param("dmlc_tracker_port", static_cast<std::int64_t>(0), Integer{});
auto nccl = get_param("dmlc_nccl_path", std::string{DefaultNcclName()}, String{});
auto ptr =
new CommGroup{std::shared_ptr<RabitComm>{new RabitComm{  // NOLINT
host, static_cast<std::int32_t>(port), std::chrono::seconds{timeout},
static_cast<std::int32_t>(retry), task_id, nccl}},
std::shared_ptr<Coll>(new Coll{})};  // NOLINT
return ptr;
} else if (type == "federated") {
#if defined(XGBOOST_USE_FEDERATED)
auto ptr = new CommGroup{
std::make_shared<FederatedComm>(retry, std::chrono::seconds{timeout}, task_id, config),
std::make_shared<FederatedColl>()};
return ptr;
#endif // defined(XGBOOST_USE_FEDERATED)
} else {
LOG(FATAL) << "Invalid communicator type";
}

return nullptr;
}

std::unique_ptr<collective::CommGroup>& GlobalCommGroup() {
static thread_local std::unique_ptr<collective::CommGroup> sptr;
if (!sptr) {
Json config{Null{}};
sptr.reset(CommGroup::Create(config));
}
return sptr;
}

void GlobalCommGroupInit(Json config) {
auto& sptr = GlobalCommGroup();
sptr.reset(CommGroup::Create(std::move(config)));
}

void GlobalCommGroupFinalize() {
auto& sptr = GlobalCommGroup();
sptr.reset();
}
} // namespace xgboost::collective
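`CommGroup::Create` pulls everything from a JSON config and accepts both lower- and upper-case keys. Assuming the keys shown above, bootstrapping the global group for a rabit cluster might look like the following sketch; the host, port, and include paths are illustrative only:

#include "comm_group.h"
#include "xgboost/json.h"

// Illustrative setup: values for the tracker are placeholders.
void SetupComm() {
  using namespace xgboost;
  Json config{Object{}};
  config["dmlc_communicator"] = String{"rabit"};
  config["dmlc_tracker_uri"] = String{"192.0.2.1"};   // tracker host (example)
  config["dmlc_tracker_port"] = Integer{9091};        // tracker port (example)
  config["dmlc_nccl_path"] = String{"libnccl.so.2"};  // optional, see DefaultNcclName()
  collective::GlobalCommGroupInit(config);
  // ... distributed training work ...
  collective::GlobalCommGroupFinalize();
}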

55
src/collective/comm_group.h
Normal file
@ -0,0 +1,55 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#pragma once
#include <memory> // for shared_ptr, unique_ptr
#include <string> // for string
#include <utility> // for move

#include "coll.h" // for Coll
#include "comm.h" // for Comm
#include "xgboost/collective/result.h" // for Result
#include "xgboost/collective/socket.h" // for GetHostName

namespace xgboost::collective {
/**
* @brief Communicator group used for double dispatching between communicators and
* collective implementations.
*/
class CommGroup {
std::shared_ptr<HostComm> comm_;
mutable std::shared_ptr<Comm> gpu_comm_;

std::shared_ptr<Coll> backend_;
mutable std::shared_ptr<Coll> gpu_coll_; // lazy initialization

CommGroup(std::shared_ptr<Comm> comm, std::shared_ptr<Coll> coll)
: comm_{std::dynamic_pointer_cast<HostComm>(comm)}, backend_{std::move(coll)} {
CHECK(comm_);
}

public:
CommGroup();

[[nodiscard]] auto World() const { return comm_->World(); }
[[nodiscard]] auto Rank() const { return comm_->Rank(); }
[[nodiscard]] bool IsDistributed() const { return comm_->IsDistributed(); }

[[nodiscard]] static CommGroup* Create(Json config);

[[nodiscard]] std::shared_ptr<Coll> Backend(DeviceOrd device) const;
[[nodiscard]] Comm const& Ctx(Context const* ctx, DeviceOrd device) const;
[[nodiscard]] Result SignalError(Result const& res) { return comm_->SignalError(res); }

[[nodiscard]] Result ProcessorName(std::string* out) const {
auto rc = GetHostName(out);
return rc;
}
};

std::unique_ptr<collective::CommGroup>& GlobalCommGroup();

void GlobalCommGroupInit(Json config);

void GlobalCommGroupFinalize();
} // namespace xgboost::collective

@ -3,6 +3,7 @@
*/
#include "communicator.h"

#include "comm.h"
#include "in_memory_communicator.h"
#include "noop_communicator.h"
#include "rabit_communicator.h"
@ -14,8 +15,12 @@
namespace xgboost::collective {
thread_local std::unique_ptr<Communicator> Communicator::communicator_{new NoOpCommunicator()};
thread_local CommunicatorType Communicator::type_{};
thread_local std::string Communicator::nccl_path_{};

void Communicator::Init(Json const& config) {
auto nccl = OptionalArg<String>(config, "dmlc_nccl_path", std::string{DefaultNcclName()});
nccl_path_ = nccl;

auto type = GetTypeFromEnv();
auto const arg = GetTypeFromConfig(config);
if (arg != CommunicatorType::kUnknown) {

@ -31,17 +31,17 @@ DeviceCommunicator* Communicator::GetDevice(int device_ordinal) {
#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
switch (type_) {
case CommunicatorType::kRabit:
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false, nccl_path_));
break;
case CommunicatorType::kFederated:
case CommunicatorType::kInMemory:
device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal));
break;
case CommunicatorType::kInMemoryNccl:
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, true));
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, true, nccl_path_));
break;
default:
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false));
device_communicator_.reset(new NcclDeviceCommunicator(device_ordinal, false, nccl_path_));
}
#else
device_communicator_.reset(new DeviceCommunicatorAdapter(device_ordinal));

@ -234,6 +234,7 @@ class Communicator {

static thread_local std::unique_ptr<Communicator> communicator_;
static thread_local CommunicatorType type_;
static thread_local std::string nccl_path_;
#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
static thread_local std::unique_ptr<DeviceCommunicator> device_communicator_;
#endif

@ -10,21 +10,26 @@
#include "xgboost/logging.h" // for CHECK

namespace xgboost::collective {
Result Loop::EmptyQueue() {
Result Loop::EmptyQueue(std::queue<Op>* p_queue) const {
timer_.Start(__func__);
auto error = [this] {
this->stop_ = true;
auto error = [this] { timer_.Stop(__func__); };

if (stop_) {
timer_.Stop(__func__);
};
return Success();
}

while (!queue_.empty() && !stop_) {
std::queue<Op> qcopy;
auto& qcopy = *p_queue;

// clear the copied queue
while (!qcopy.empty()) {
rabit::utils::PollHelper poll;
std::size_t n_ops = qcopy.size();

// watch all ops
while (!queue_.empty()) {
auto op = queue_.front();
queue_.pop();
// Iterate through all the ops for poll
for (std::size_t i = 0; i < n_ops; ++i) {
auto op = qcopy.front();
qcopy.pop();

switch (op.code) {
case Op::kRead: {
@ -40,6 +45,7 @@ Result Loop::EmptyQueue() {
return Fail("Invalid socket operation.");
}
}

qcopy.push(op);
}

@ -51,10 +57,12 @@ Result Loop::EmptyQueue() {
error();
return rc;
}

// we wouldn't be here if the queue is empty.
CHECK(!qcopy.empty());

while (!qcopy.empty() && !stop_) {
// Iterate through all the ops for performing the operations
for (std::size_t i = 0; i < n_ops; ++i) {
auto op = qcopy.front();
qcopy.pop();

@ -81,20 +89,21 @@ Result Loop::EmptyQueue() {
}

if (n_bytes_done == -1 && !system::LastErrorWouldBlock()) {
stop_ = true;
auto rc = system::FailWithCode("Invalid socket output.");
error();
return rc;
}

op.off += n_bytes_done;
CHECK_LE(op.off, op.n);

if (op.off != op.n) {
// not yet finished, push back to queue for next round.
queue_.push(op);
qcopy.push(op);
}
}
}

timer_.Stop(__func__);
return Success();
}
@ -107,22 +116,46 @@ void Loop::Process() {
if (stop_) {
break;
}
CHECK(!mu_.try_lock());

this->rc_ = this->EmptyQueue();
if (!rc_.OK()) {
stop_ = true;
auto unlock_notify = [&](bool is_blocking, bool stop) {
if (!is_blocking) {
std::lock_guard guard{mu_};
stop_ = stop;
} else {
stop_ = stop;
lock.unlock();
}
cv_.notify_one();
break;
};

// move the queue
std::queue<Op> qcopy;
bool is_blocking = false;
while (!queue_.empty()) {
auto op = queue_.front();
queue_.pop();
if (op.code == Op::kBlock) {
is_blocking = true;
} else {
qcopy.push(op);
}
}
// unblock the queue
if (!is_blocking) {
lock.unlock();
}
// clear the queue
auto rc = this->EmptyQueue(&qcopy);
// Handle error
if (!rc.OK()) {
unlock_notify(is_blocking, true);
std::lock_guard<std::mutex> guard{rc_lock_};
this->rc_ = std::move(rc);
return;
}

CHECK(queue_.empty());
CHECK(!mu_.try_lock());
cv_.notify_one();
}

if (rc_.OK()) {
CHECK(queue_.empty());
CHECK(qcopy.empty());
unlock_notify(is_blocking, false);
}
}

@ -140,6 +173,24 @@ Result Loop::Stop() {
return Success();
}

[[nodiscard]] Result Loop::Block() {
{
std::lock_guard<std::mutex> guard{rc_lock_};
if (!rc_.OK()) {
return std::move(rc_);
}
}
this->Submit(Op{Op::kBlock});
{
std::unique_lock lock{mu_};
cv_.wait(lock, [this] { return (this->queue_.empty()) || stop_; });
}
{
std::lock_guard<std::mutex> lock{rc_lock_};
return std::move(rc_);
}
}
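The reworked `Block` no longer polls the queue directly: it enqueues a sentinel `kBlock` op and sleeps until the worker thread reports that everything ahead of the sentinel has been drained. A condensed, runnable model of that handshake (error transfer through `rc_lock_` and timeouts are omitted):

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>

// Toy model of Loop's kBlock handshake: Block() pushes a sentinel and waits
// until the worker has drained the queue.
std::mutex mu;
std::condition_variable cv;
std::queue<int> q;  // -1 acts like Op::kBlock
bool stop = false;

void Worker() {
  while (true) {
    std::unique_lock lk{mu};
    cv.wait(lk, [] { return !q.empty() || stop; });
    if (stop) return;
    bool blocking = false;
    while (!q.empty()) {  // move the queue, remember the sentinel
      if (q.front() == -1) blocking = true;
      else std::cout << "op " << q.front() << "\n";
      q.pop();
    }
    if (blocking) cv.notify_one();  // wake the caller stuck in Block()
  }
}

void Block() {
  std::unique_lock lk{mu};
  q.push(-1);
  cv.notify_one();
  cv.wait(lk, [] { return q.empty(); });  // returns once the worker drained it
}

int main() {
  std::thread t{Worker};
  { std::lock_guard lk{mu}; q.push(1); q.push(2); }
  Block();
  { std::lock_guard lk{mu}; stop = true; }
  cv.notify_one();
  t.join();
}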

Loop::Loop(std::chrono::seconds timeout) : timeout_{timeout} {
timer_.Init(__func__);
worker_ = std::thread{[this] {

@ -20,13 +20,14 @@ namespace xgboost::collective {
class Loop {
public:
struct Op {
enum Code : std::int8_t { kRead = 0, kWrite = 1 } code;
enum Code : std::int8_t { kRead = 0, kWrite = 1, kBlock = 2 } code;
std::int32_t rank{-1};
std::int8_t* ptr{nullptr};
std::size_t n{0};
TCPSocket* sock{nullptr};
std::size_t off{0};

explicit Op(Code c) : code{c} { CHECK(c == kBlock); }
Op(Code c, std::int32_t rank, std::int8_t* ptr, std::size_t n, TCPSocket* sock, std::size_t off)
: code{c}, rank{rank}, ptr{ptr}, n{n}, sock{sock}, off{off} {}
Op(Op const&) = default;
@ -41,12 +42,15 @@ class Loop {
std::mutex mu_;
std::queue<Op> queue_;
std::chrono::seconds timeout_;

Result rc_;
std::mutex rc_lock_; // lock for transferring error info.

bool stop_{false};
std::exception_ptr curr_exce_{nullptr};
common::Monitor timer_;
common::Monitor mutable timer_;

Result EmptyQueue();
Result EmptyQueue(std::queue<Op>* p_queue) const;
void Process();

public:
@ -60,15 +64,7 @@ class Loop {
cv_.notify_one();
}

[[nodiscard]] Result Block() {
{
std::unique_lock lock{mu_};
cv_.notify_all();
}
std::unique_lock lock{mu_};
cv_.wait(lock, [this] { return this->queue_.empty() || stop_; });
return std::move(rc_);
}
[[nodiscard]] Result Block();

explicit Loop(std::chrono::seconds timeout);

@ -2,12 +2,14 @@
* Copyright 2023 XGBoost contributors
*/
#if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL)
#include "comm.cuh"
#include "nccl_device_communicator.cuh"

namespace xgboost {
namespace collective {

NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sync)
NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sync,
StringView nccl_path)
: device_ordinal_{device_ordinal},
needs_sync_{needs_sync},
world_size_{GetWorldSize()},
@ -18,6 +20,7 @@ NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sy
if (world_size_ == 1) {
return;
}
stub_ = std::make_shared<NcclStub>(std::move(nccl_path));

std::vector<uint64_t> uuids(world_size_ * kUuidLength, 0);
auto s_uuid = xgboost::common::Span<uint64_t>{uuids.data(), uuids.size()};
@ -43,7 +46,8 @@ NcclDeviceCommunicator::NcclDeviceCommunicator(int device_ordinal, bool needs_sy

nccl_unique_id_ = GetUniqueId();
dh::safe_cuda(cudaSetDevice(device_ordinal_));
dh::safe_nccl(ncclCommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_));
auto rc = stub_->CommInitRank(&nccl_comm_, world_size_, nccl_unique_id_, rank_);
CHECK(rc.OK()) << rc.Report();
}

NcclDeviceCommunicator::~NcclDeviceCommunicator() {
@ -51,7 +55,8 @@ NcclDeviceCommunicator::~NcclDeviceCommunicator() {
return;
}
if (nccl_comm_) {
dh::safe_nccl(ncclCommDestroy(nccl_comm_));
auto rc = stub_->CommDestroy(nccl_comm_);
CHECK(rc.OK()) << rc.Report();
}
if (xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) {
LOG(CONSOLE) << "======== NCCL Statistics========";
@ -137,8 +142,9 @@ void NcclDeviceCommunicator::BitwiseAllReduce(void *send_receive_buffer, std::si
auto *device_buffer = buffer.data().get();

// First gather data from all the workers.
dh::safe_nccl(ncclAllGather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type),
nccl_comm_, dh::DefaultStream()));
auto rc = stub_->Allgather(send_receive_buffer, device_buffer, count, GetNcclDataType(data_type),
nccl_comm_, dh::DefaultStream());
CHECK(rc.OK()) << rc.Report();
if (needs_sync_) {
dh::DefaultStream().Sync();
}
@ -170,9 +176,10 @@ void NcclDeviceCommunicator::AllReduce(void *send_receive_buffer, std::size_t co
if (IsBitwiseOp(op)) {
BitwiseAllReduce(send_receive_buffer, count, data_type, op);
} else {
dh::safe_nccl(ncclAllReduce(send_receive_buffer, send_receive_buffer, count,
GetNcclDataType(data_type), GetNcclRedOp(op), nccl_comm_,
dh::DefaultStream()));
auto rc = stub_->Allreduce(send_receive_buffer, send_receive_buffer, count,
GetNcclDataType(data_type), GetNcclRedOp(op), nccl_comm_,
dh::DefaultStream());
CHECK(rc.OK()) << rc.Report();
}
allreduce_bytes_ += count * GetTypeSize(data_type);
allreduce_calls_ += 1;
@ -185,8 +192,9 @@ void NcclDeviceCommunicator::AllGather(void const *send_buffer, void *receive_bu
}

dh::safe_cuda(cudaSetDevice(device_ordinal_));
dh::safe_nccl(ncclAllGather(send_buffer, receive_buffer, send_size, ncclInt8, nccl_comm_,
dh::DefaultStream()));
auto rc = stub_->Allgather(send_buffer, receive_buffer, send_size, ncclInt8, nccl_comm_,
dh::DefaultStream());
CHECK(rc.OK()) << rc.Report();
}

void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_bytes,
@ -206,14 +214,18 @@ void NcclDeviceCommunicator::AllGatherV(void const *send_buffer, size_t length_b
receive_buffer->resize(total_bytes);

size_t offset = 0;
dh::safe_nccl(ncclGroupStart());
for (int32_t i = 0; i < world_size_; ++i) {
size_t as_bytes = segments->at(i);
dh::safe_nccl(ncclBroadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes,
ncclChar, i, nccl_comm_, dh::DefaultStream()));
offset += as_bytes;
}
dh::safe_nccl(ncclGroupEnd());
auto rc = Success() << [&] { return stub_->GroupStart(); } << [&] {
for (int32_t i = 0; i < world_size_; ++i) {
size_t as_bytes = segments->at(i);
auto rc = stub_->Broadcast(send_buffer, receive_buffer->data().get() + offset, as_bytes,
ncclChar, i, nccl_comm_, dh::DefaultStream());
if (!rc.OK()) {
return rc;
}
offset += as_bytes;
}
return Success();
} << [&] { return stub_->GroupEnd(); };
}

void NcclDeviceCommunicator::Synchronize() {

@ -4,8 +4,10 @@
#pragma once

#include "../common/device_helpers.cuh"
#include "comm.cuh"
#include "communicator.h"
#include "device_communicator.cuh"
#include "nccl_stub.h"

namespace xgboost {
namespace collective {
@ -25,7 +27,7 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
* needed. The in-memory communicator is used in tests with multiple threads, each thread
* representing a rank/worker, so the additional synchronization is needed to avoid deadlocks.
*/
explicit NcclDeviceCommunicator(int device_ordinal, bool needs_sync);
explicit NcclDeviceCommunicator(int device_ordinal, bool needs_sync, StringView nccl_path);
~NcclDeviceCommunicator() override;
void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type,
Operation op) override;
@ -74,7 +76,8 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
static const int kRootRank = 0;
ncclUniqueId id;
if (rank_ == kRootRank) {
dh::safe_nccl(ncclGetUniqueId(&id));
auto rc = stub_->GetUniqueId(&id);
CHECK(rc.OK()) << rc.Report();
}
Broadcast(static_cast<void *>(&id), sizeof(ncclUniqueId), static_cast<int>(kRootRank));
return id;
@ -88,6 +91,7 @@ class NcclDeviceCommunicator : public DeviceCommunicator {
int const world_size_;
int const rank_;
ncclComm_t nccl_comm_{};
std::shared_ptr<NcclStub> stub_;
ncclUniqueId nccl_unique_id_{};
size_t allreduce_bytes_{0}; // Keep statistics of the number of bytes communicated.
size_t allreduce_calls_{0}; // Keep statistics of the number of reduce calls.

131
src/collective/nccl_stub.cc
Normal file
@ -0,0 +1,131 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#if defined(XGBOOST_USE_NCCL) || (defined(XGBOOST_USE_RCCL) && 0)
#include "nccl_stub.h"

#include <cuda.h> // for CUDA_VERSION
#include <cuda_runtime_api.h> // for cudaPeekAtLastError
#include <dlfcn.h> // for dlclose, dlsym, dlopen
#include <nccl.h>
#include <thrust/system/cuda/error.h> // for cuda_category
#include <thrust/system_error.h> // for system_error

#include <cstdint> // for int32_t
#include <sstream> // for stringstream
#include <string> // for string
#include <utility> // for move

#include "xgboost/logging.h"

namespace xgboost::collective {
Result NcclStub::GetNcclResult(ncclResult_t code) const {
if (code == ncclSuccess) {
return Success();
}

std::stringstream ss;
ss << "NCCL failure: " << this->GetErrorString(code) << ".";
if (code == ncclUnhandledCudaError) {
// nccl usually preserves the last error so we can get more details.
auto err = cudaPeekAtLastError();
ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
} else if (code == ncclSystemError) {
ss << " This might be caused by a network configuration issue. Please consider specifying "
"the network interface for NCCL via environment variables listed in its reference: "
"`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
}
return Fail(ss.str());
}

NcclStub::NcclStub(StringView path) : path_{std::move(path)} {
#if defined(XGBOOST_USE_DLOPEN_NCCL) || defined(XGBOOST_USE_DLOPEN_RCCL)
CHECK(!path_.empty()) << "Empty path for NCCL.";

auto cu_major = (CUDA_VERSION) / 1000;
std::stringstream ss;
ss << R"m(

If XGBoost is installed from PyPI with pip, the error can be fixed by:

- Run `pip install nvidia-nccl-cu)m"
<< cu_major << "` (Or with any CUDA version that's compatible with " << cu_major << ").";
ss << R"m(

Otherwise, please refer to:

https://xgboost.readthedocs.io/en/stable/tutorials/dask.html#troubleshooting

for more info, or open an issue on GitHub. Starting from XGBoost 2.1.0, the PyPI package
no longer bundles NCCL in the binary wheel.

)m";
auto help = ss.str();
std::string msg{"Failed to load NCCL from path: `" + path_ + "`. Error:\n "};

auto safe_load = [&](auto t, StringView name) {
std::stringstream errs;
auto ptr = reinterpret_cast<decltype(t)>(dlsym(handle_, name.c_str()));
if (!ptr) {
errs << "Failed to load NCCL symbol `" << name << "` from " << path_ << ". Error:\n "
<< dlerror() << help;
LOG(FATAL) << errs.str();
}
return ptr;
};

handle_ = dlopen(path_.c_str(), RTLD_LAZY);
if (!handle_) {
LOG(FATAL) << msg << dlerror() << help;
}

allreduce_ = safe_load(allreduce_, "ncclAllReduce");
broadcast_ = safe_load(broadcast_, "ncclBroadcast");
allgather_ = safe_load(allgather_, "ncclAllGather");
comm_init_rank_ = safe_load(comm_init_rank_, "ncclCommInitRank");
comm_destroy_ = safe_load(comm_destroy_, "ncclCommDestroy");
get_uniqueid_ = safe_load(get_uniqueid_, "ncclGetUniqueId");
send_ = safe_load(send_, "ncclSend");
recv_ = safe_load(recv_, "ncclRecv");
group_start_ = safe_load(group_start_, "ncclGroupStart");
group_end_ = safe_load(group_end_, "ncclGroupEnd");
get_error_string_ = safe_load(get_error_string_, "ncclGetErrorString");
get_version_ = safe_load(get_version_, "ncclGetVersion");

std::int32_t v;
CHECK_EQ(get_version_(&v), ncclSuccess);
auto patch = v % 100;
auto minor = (v / 100) % 100;
auto major = v / 10000;

LOG(INFO) << "Loaded shared NCCL " << major << "." << minor << "." << patch << ":`" << path_
<< "`" << std::endl;
#else
allreduce_ = ncclAllReduce;
broadcast_ = ncclBroadcast;
allgather_ = ncclAllGather;
comm_init_rank_ = ncclCommInitRank;
comm_destroy_ = ncclCommDestroy;
get_uniqueid_ = ncclGetUniqueId;
send_ = ncclSend;
recv_ = ncclRecv;
group_start_ = ncclGroupStart;
group_end_ = ncclGroupEnd;
get_error_string_ = ncclGetErrorString;
get_version_ = ncclGetVersion;
#endif
};

NcclStub::~NcclStub() { // NOLINT
#if defined(XGBOOST_USE_DLOPEN_NCCL) || defined(XGBOOST_USE_DLOPEN_RCCL)
if (handle_) {
auto rc = dlclose(handle_);
if (rc != 0) {
LOG(WARNING) << "Failed to close NCCL handle:" << dlerror();
}
}
handle_ = nullptr;
#endif // defined(XGBOOST_USE_DLOPEN_NCCL)
}
} // namespace xgboost::collective
#endif // defined(XGBOOST_USE_NCCL)
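The whole stub boils down to `dlopen` plus a typed `dlsym` per entry point, which is what lets the wheel ship without linking NCCL at build time. The same pattern in isolation, resolving `cos` from libm so the example runs anywhere (the library name is platform specific):

#include <dlfcn.h>
#include <cstdio>

// Minimal dlopen/dlsym sketch mirroring NcclStub::NcclStub; NcclStub does the
// same cast-after-lookup for each NCCL symbol.
int main() {
  void* handle = dlopen("libm.so.6", RTLD_LAZY);
  if (!handle) { std::fprintf(stderr, "%s\n", dlerror()); return 1; }
  using CosFn = double (*)(double);
  auto cosine = reinterpret_cast<CosFn>(dlsym(handle, "cos"));
  if (!cosine) { std::fprintf(stderr, "%s\n", dlerror()); return 1; }
  std::printf("cos(0) = %f\n", cosine(0.0));
  dlclose(handle);
}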

86
src/collective/nccl_stub.h
Normal file
@ -0,0 +1,86 @@
/**
* Copyright 2023, XGBoost Contributors
*/
#pragma once
#if defined(XGBOOST_USE_NCCL) || (defined(XGBOOST_USE_RCCL) && 0)

#include <cuda_runtime_api.h>
#include <nccl.h>

#include <string> // for string

#include "xgboost/collective/result.h" // for Result
#include "xgboost/string_view.h" // for StringView

namespace xgboost::collective {
/**
* @brief A stub for NCCL to facilitate dynamic loading.
*/
class NcclStub {
#if defined(XGBOOST_USE_DLOPEN_NCCL) || defined(XGBOOST_USE_DLOPEN_RCCL)
void* handle_{nullptr};
#endif // defined(XGBOOST_USE_DLOPEN_NCCL)
std::string path_;

decltype(ncclAllReduce)* allreduce_{nullptr};
decltype(ncclBroadcast)* broadcast_{nullptr};
decltype(ncclAllGather)* allgather_{nullptr};
decltype(ncclCommInitRank)* comm_init_rank_{nullptr};
decltype(ncclCommDestroy)* comm_destroy_{nullptr};
decltype(ncclGetUniqueId)* get_uniqueid_{nullptr};
decltype(ncclSend)* send_{nullptr};
decltype(ncclRecv)* recv_{nullptr};
decltype(ncclGroupStart)* group_start_{nullptr};
decltype(ncclGroupEnd)* group_end_{nullptr};
decltype(ncclGetErrorString)* get_error_string_{nullptr};
decltype(ncclGetVersion)* get_version_{nullptr};

public:
Result GetNcclResult(ncclResult_t code) const;

public:
explicit NcclStub(StringView path);
~NcclStub();

[[nodiscard]] Result Allreduce(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
cudaStream_t stream) const {
return this->GetNcclResult(allreduce_(sendbuff, recvbuff, count, datatype, op, comm, stream));
}
[[nodiscard]] Result Broadcast(const void* sendbuff, void* recvbuff, size_t count,
ncclDataType_t datatype, int root, ncclComm_t comm,
cudaStream_t stream) const {
return this->GetNcclResult(broadcast_(sendbuff, recvbuff, count, datatype, root, comm, stream));
}
[[nodiscard]] Result Allgather(const void* sendbuff, void* recvbuff, size_t sendcount,
ncclDataType_t datatype, ncclComm_t comm,
cudaStream_t stream) const {
return this->GetNcclResult(allgather_(sendbuff, recvbuff, sendcount, datatype, comm, stream));
}
[[nodiscard]] Result CommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId,
int rank) const {
return this->GetNcclResult(this->comm_init_rank_(comm, nranks, commId, rank));
}
[[nodiscard]] Result CommDestroy(ncclComm_t comm) const {
return this->GetNcclResult(comm_destroy_(comm));
}
[[nodiscard]] Result GetUniqueId(ncclUniqueId* uniqueId) const {
return this->GetNcclResult(get_uniqueid_(uniqueId));
}
[[nodiscard]] Result Send(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) {
return this->GetNcclResult(send_(sendbuff, count, datatype, peer, comm, stream));
}
[[nodiscard]] Result Recv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
ncclComm_t comm, cudaStream_t stream) const {
return this->GetNcclResult(recv_(recvbuff, count, datatype, peer, comm, stream));
}
[[nodiscard]] Result GroupStart() const { return this->GetNcclResult(group_start_()); }
[[nodiscard]] Result GroupEnd() const { return this->GetNcclResult(group_end_()); }
[[nodiscard]] const char* GetErrorString(ncclResult_t result) const {
return get_error_string_(result);
}
};
} // namespace xgboost::collective

#endif // defined(XGBOOST_USE_NCCL)
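With this interface, callers route every NCCL call through the stub and get a `Result` back instead of a raised error. A hypothetical single-rank flow; in practice `id`, `world`, and `rank` come from the tracker and the CPU broadcast described earlier, and the include paths are illustrative:

#include "nccl_stub.h"
#include "xgboost/collective/result.h"

// Hypothetical usage sketch: initialize a communicator through the stub and
// tear it down, propagating failures as Result instead of aborting.
xgboost::collective::Result InitAndDestroy(int world, int rank, ncclUniqueId id) {
  using xgboost::collective::NcclStub;
  NcclStub stub{xgboost::StringView{"libnccl.so.2"}};  // path from dmlc_nccl_path
  ncclComm_t comm;
  auto rc = stub.CommInitRank(&comm, world, id, rank);
  if (!rc.OK()) return rc;
  return stub.CommDestroy(comm);
}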

@ -58,36 +58,35 @@ Result Tracker::WaitUntilReady() const {

RabitTracker::WorkerProxy::WorkerProxy(std::int32_t world, TCPSocket sock, SockAddrV4 addr)
: sock_{std::move(sock)} {
auto host = addr.Addr();

std::int32_t rank{0};
rc_ = Success()
<< [&] { return proto::Magic{}.Verify(&sock_); }
<< [&] { return proto::Connect{}.TrackerRecv(&sock_, &world_, &rank, &task_id_); };
if (!rc_.OK()) {
return;
}

std::string cmd;
sock_.Recv(&cmd);
auto jcmd = Json::Load(StringView{cmd});
cmd_ = static_cast<proto::CMD>(get<Integer const>(jcmd["cmd"]));
Json jcmd;
std::int32_t port{0};
if (cmd_ == proto::CMD::kStart) {
proto::Start start;
rc_ = start.TrackerHandle(jcmd, &world_, world, &port, &sock_, &eport_);
} else if (cmd_ == proto::CMD::kPrint) {
proto::Print print;
rc_ = print.TrackerHandle(jcmd, &msg_);
} else if (cmd_ == proto::CMD::kError) {
proto::ErrorCMD error;
rc_ = error.TrackerHandle(jcmd, &msg_, &code_);
}
if (!rc_.OK()) {
return;
}

info_ = proto::PeerInfo{host, port, rank};
rc_ = Success() << [&] { return proto::Magic{}.Verify(&sock_); } << [&] {
return proto::Connect{}.TrackerRecv(&sock_, &world_, &rank, &task_id_);
} << [&] {
std::string cmd;
sock_.Recv(&cmd);
jcmd = Json::Load(StringView{cmd});
cmd_ = static_cast<proto::CMD>(get<Integer const>(jcmd["cmd"]));
return Success();
} << [&] {
if (cmd_ == proto::CMD::kStart) {
proto::Start start;
return start.TrackerHandle(jcmd, &world_, world, &port, &sock_, &eport_);
} else if (cmd_ == proto::CMD::kPrint) {
proto::Print print;
return print.TrackerHandle(jcmd, &msg_);
} else if (cmd_ == proto::CMD::kError) {
proto::ErrorCMD error;
return error.TrackerHandle(jcmd, &msg_, &code_);
}
return Success();
} << [&] {
auto host = addr.Addr();
info_ = proto::PeerInfo{host, port, rank};
return Success();
};
}

RabitTracker::RabitTracker(Json const& config) : Tracker{config} {
@ -137,15 +136,18 @@ Result RabitTracker::Bootstrap(std::vector<WorkerProxy>* p_workers) {

std::int32_t n_shutdown{0};
bool during_restart{false};
bool running{false};
std::vector<WorkerProxy> pending;

explicit State(std::int32_t world) : n_workers{world} {}
State(State const& that) = delete;
State& operator=(State&& that) = delete;

// modifiers
void Start(WorkerProxy&& worker) {
CHECK_LT(pending.size(), n_workers);
CHECK_LE(n_shutdown, n_workers);
CHECK(!running);

pending.emplace_back(std::forward<WorkerProxy>(worker));

@ -155,6 +157,7 @@ Result RabitTracker::Bootstrap(std::vector<WorkerProxy>* p_workers) {
CHECK_GE(n_shutdown, 0);
CHECK_LT(n_shutdown, n_workers);

running = false;
++n_shutdown;

CHECK_LE(n_shutdown, n_workers);
@ -163,21 +166,26 @@ Result RabitTracker::Bootstrap(std::vector<WorkerProxy>* p_workers) {
CHECK_LE(pending.size(), n_workers);
CHECK_LE(n_shutdown, n_workers);

running = false;
during_restart = true;
}
[[nodiscard]] bool Ready() const {
CHECK_LE(pending.size(), n_workers);
return static_cast<std::int32_t>(pending.size()) == n_workers;
}
void Bootstrap() {
CHECK_EQ(pending.size(), n_workers);
CHECK_LE(n_shutdown, n_workers);

running = true;

// A reset.
n_shutdown = 0;
during_restart = false;
pending.clear();
}

// observers
[[nodiscard]] bool Ready() const {
CHECK_LE(pending.size(), n_workers);
return static_cast<std::int32_t>(pending.size()) == n_workers;
}
[[nodiscard]] bool ShouldContinue() const {
CHECK_LE(pending.size(), n_workers);
CHECK_LE(n_shutdown, n_workers);
@ -187,7 +195,31 @@ Result RabitTracker::Bootstrap(std::vector<WorkerProxy>* p_workers) {
}
};

return std::async(std::launch::async, [this] {
auto handle_error = [&](WorkerProxy const& worker) {
auto msg = worker.Msg();
auto code = worker.Code();
LOG(WARNING) << "Received error from [" << worker.Host() << ":" << worker.Rank() << "]: " << msg
<< " code:" << code;
auto host = worker.Host();
// We signal all workers for the error, if they haven't aborted already.
for (auto& w : worker_error_handles_) {
if (w.first == host) {
continue;
}
TCPSocket out;
// Connecting to the error port as a signal for exit.
//
// retry is set to 1, just let the worker timeout or error. Otherwise the
// tracker and the worker might be waiting for each other.
auto rc = Connect(w.first, w.second, 1, timeout_, &out);
if (!rc.OK()) {
return Fail("Failed to inform workers to stop.");
}
}
return Success();
};

return std::async(std::launch::async, [this, handle_error] {
State state{this->n_workers_};

while (state.ShouldContinue()) {
@ -205,6 +237,16 @@ Result RabitTracker::Bootstrap(std::vector<WorkerProxy>* p_workers) {
}
switch (worker.Command()) {
case proto::CMD::kStart: {
if (state.running) {
// Something went wrong with one of the workers. It got disconnected without
// notice.
state.Error();
rc = handle_error(worker);
if (!rc.OK()) {
return Fail("Failed to handle abort.", std::move(rc));
}
}

state.Start(std::move(worker));
if (state.Ready()) {
rc = this->Bootstrap(&state.pending);
@ -216,36 +258,20 @@ Result RabitTracker::Bootstrap(std::vector<WorkerProxy>* p_workers) {
continue;
}
case proto::CMD::kShutdown: {
if (state.during_restart) {
// The worker can still send shutdown after call to `std::exit`.
continue;
}
state.Shutdown();
continue;
}
case proto::CMD::kError: {
if (state.during_restart) {
// Ignore further errors.
continue;
}
state.Error();
auto msg = worker.Msg();
auto code = worker.Code();
LOG(WARNING) << "Received error from [" << worker.Host() << ":" << worker.Rank()
<< "]: " << msg << " code:" << code;
auto host = worker.Host();
// We signal all workers for the error, if they haven't aborted already.
for (auto& w : worker_error_handles_) {
if (w.first == host) {
continue;
}
TCPSocket out;
// retry is set to 1, just let the worker timeout or error. Otherwise the
// tracker and the worker might be waiting for each other.
auto rc = Connect(w.first, w.second, 1, timeout_, &out);
// send signal to stop the worker.
proto::ShutdownCMD shutdown;
rc = shutdown.Send(&out);
if (!rc.OK()) {
return Fail("Failed to inform workers to stop.");
}
}

rc = handle_error(worker);
continue;
}
case proto::CMD::kPrint: {

@ -114,6 +114,9 @@ class RabitTracker : public Tracker {
// record for how to reach out to workers if error happens.
std::vector<std::pair<std::string, std::int32_t>> worker_error_handles_;
// listening socket for incoming workers.
//
// At the moment, the listener calls accept without first polling. We can add an
// additional unix domain socket to allow cancelling the accept.
TCPSocket listener_;

Result Bootstrap(std::vector<WorkerProxy>* p_workers);
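Note that the tracker reaches a stuck worker simply by opening a TCP connection to the worker's dedicated error port; the connection itself is the signal, which is why the error thread in `RabitComm` only needs `Accept` to return. A minimal sketch of the receiving side (POSIX sockets, setup and error handling elided):

#include <sys/socket.h>
#include <unistd.h>
#include <cstdio>

// Sketch of the error-port idea: the mere arrival of a connection is the
// signal, no payload is required. `listen_fd` is an already-bound, listening socket.
void WaitForAbortSignal(int listen_fd) {
  // Blocks until the tracker (or a peer) connects to the error port.
  int conn = accept(listen_fd, nullptr, nullptr);
  if (conn >= 0) {
    std::puts("peer signalled an error, shutting down");
    close(conn);
  }
}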

@ -29,8 +29,7 @@
#include "xgboost/logging.h" // CHECK
#include "xgboost/span.h" // Span,byte

namespace xgboost {
namespace common {
namespace xgboost::common {
namespace detail {

// Wrapper around cub sort to define is_decending
@ -165,13 +164,14 @@ inline void SegmentedSortKeys(Context const *ctx, Span<V const> group_ptr,
template <bool accending, bool per_seg_index, typename U, typename V, typename IdxT>
void SegmentedArgSort(Context const *ctx, Span<U> values, Span<V> group_ptr,
Span<IdxT> sorted_idx) {
auto cuctx = ctx->CUDACtx();
CHECK_GE(group_ptr.size(), 1ul);
std::size_t n_groups = group_ptr.size() - 1;
std::size_t bytes = 0;
if (per_seg_index) {
SegmentedSequence(ctx, group_ptr, sorted_idx);
} else {
dh::Iota(sorted_idx);
dh::Iota(sorted_idx, cuctx->Stream());
}
dh::TemporaryArray<std::remove_const_t<U>> values_out(values.size());
dh::TemporaryArray<std::remove_const_t<IdxT>> sorted_idx_out(sorted_idx.size());
@ -179,15 +179,16 @@ void SegmentedArgSort(Context const *ctx, Span<U> values, Span<V> group_ptr,
detail::DeviceSegmentedRadixSortPair<!accending>(
nullptr, bytes, values.data(), values_out.data().get(), sorted_idx.data(),
sorted_idx_out.data().get(), sorted_idx.size(), n_groups, group_ptr.data(),
group_ptr.data() + 1, ctx->CUDACtx()->Stream());
group_ptr.data() + 1, cuctx->Stream());
dh::TemporaryArray<byte> temp_storage(bytes);
detail::DeviceSegmentedRadixSortPair<!accending>(
temp_storage.data().get(), bytes, values.data(), values_out.data().get(), sorted_idx.data(),
sorted_idx_out.data().get(), sorted_idx.size(), n_groups, group_ptr.data(),
group_ptr.data() + 1, ctx->CUDACtx()->Stream());
group_ptr.data() + 1, cuctx->Stream());

dh::safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice));
sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice,
cuctx->Stream()));
}

/**
@ -197,11 +198,12 @@ void SegmentedArgSort(Context const *ctx, Span<U> values, Span<V> group_ptr,
template <typename SegIt, typename ValIt>
void SegmentedArgMergeSort(Context const *ctx, SegIt seg_begin, SegIt seg_end, ValIt val_begin,
ValIt val_end, dh::device_vector<std::size_t> *p_sorted_idx) {
auto cuctx = ctx->CUDACtx();
using Tup = thrust::tuple<std::int32_t, float>;
auto &sorted_idx = *p_sorted_idx;
std::size_t n = std::distance(val_begin, val_end);
sorted_idx.resize(n);
dh::Iota(dh::ToSpan(sorted_idx));
dh::Iota(dh::ToSpan(sorted_idx), cuctx->Stream());
dh::device_vector<Tup> keys(sorted_idx.size());
auto key_it = dh::MakeTransformIterator<Tup>(thrust::make_counting_iterator(0ul),
[=] XGBOOST_DEVICE(std::size_t i) -> Tup {
@ -215,7 +217,7 @@ void SegmentedArgMergeSort(Context const *ctx, SegIt seg_begin, SegIt seg_end, V
return thrust::make_tuple(seg_idx, residue);
});
thrust::copy(ctx->CUDACtx()->CTP(), key_it, key_it + keys.size(), keys.begin());
thrust::stable_sort_by_key(ctx->CUDACtx()->TP(), keys.begin(), keys.end(), sorted_idx.begin(),
thrust::stable_sort_by_key(cuctx->TP(), keys.begin(), keys.end(), sorted_idx.begin(),
[=] XGBOOST_DEVICE(Tup const &l, Tup const &r) {
if (thrust::get<0>(l) != thrust::get<0>(r)) {
return thrust::get<0>(l) < thrust::get<0>(r); // segment index
@ -223,6 +225,75 @@ void SegmentedArgMergeSort(Context const *ctx, SegIt seg_begin, SegIt seg_end, V
return thrust::get<1>(l) < thrust::get<1>(r); // residue
});
}
} // namespace common
} // namespace xgboost
|
||||
|
||||
template <bool accending, typename IdxT, typename U>
|
||||
void ArgSort(xgboost::Context const *ctx, xgboost::common::Span<U> keys,
|
||||
xgboost::common::Span<IdxT> sorted_idx) {
|
||||
std::size_t bytes = 0;
|
||||
auto cuctx = ctx->CUDACtx();
|
||||
dh::Iota(sorted_idx, cuctx->Stream());
|
||||
|
||||
using KeyT = typename decltype(keys)::value_type;
|
||||
using ValueT = std::remove_const_t<IdxT>;
|
||||
|
||||
dh::TemporaryArray<KeyT> out(keys.size());
|
||||
cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(keys.data()), out.data().get());
|
||||
dh::TemporaryArray<IdxT> sorted_idx_out(sorted_idx.size());
|
||||
cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(sorted_idx.data()),
|
||||
sorted_idx_out.data().get());
|
||||
|
||||
// track https://github.com/NVIDIA/cub/pull/340 for 64bit length support
|
||||
using OffsetT = std::conditional_t<!dh::BuildWithCUDACub(), std::ptrdiff_t, int32_t>;
|
||||
CHECK_LE(sorted_idx.size(), std::numeric_limits<OffsetT>::max());
|
||||
if (accending) {
|
||||
void *d_temp_storage = nullptr;
|
||||
#if THRUST_MAJOR_VERSION >= 2
|
||||
dh::safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
|
||||
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
|
||||
cuctx->Stream())));
|
||||
#else
|
||||
dh::safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
|
||||
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
|
||||
nullptr, false)));
|
||||
#endif
|
||||
dh::TemporaryArray<char> storage(bytes);
|
||||
d_temp_storage = storage.data().get();
|
||||
#if THRUST_MAJOR_VERSION >= 2
|
||||
dh::safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
|
||||
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
|
||||
cuctx->Stream())));
|
||||
#else
|
||||
dh::safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
|
||||
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
|
||||
nullptr, false)));
|
||||
#endif
|
||||
} else {
|
||||
void *d_temp_storage = nullptr;
|
||||
#if THRUST_MAJOR_VERSION >= 2
|
||||
dh::safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
|
||||
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
|
||||
cuctx->Stream())));
|
||||
#else
|
||||
dh::safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
|
||||
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
|
||||
nullptr, false)));
|
||||
#endif
|
||||
dh::TemporaryArray<char> storage(bytes);
|
||||
d_temp_storage = storage.data().get();
|
||||
#if THRUST_MAJOR_VERSION >= 2
|
||||
dh::safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
|
||||
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
|
||||
cuctx->Stream())));
|
||||
#else
|
||||
dh::safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
|
||||
d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0, sizeof(KeyT) * 8, false,
|
||||
nullptr, false)));
|
||||
#endif
|
||||
}
|
||||
|
||||
dh::safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
|
||||
sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice,
|
||||
cuctx->Stream()));
|
||||
}
|
||||
} // namespace xgboost::common
|
||||
#endif // XGBOOST_COMMON_ALGORITHM_CUH_
|
||||
|
||||
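For orientation, here is a minimal call-site sketch for the Context-aware ArgSort overload added above. The wrapper function and container names are hypothetical, not part of this diff; it assumes a CUDA build with XGBoost's internal device_helpers.cuh available.

// Hypothetical call site. ArgSort now receives the per-booster Context, so
// the radix sort and the final copy both run on the Context's CUDA stream
// instead of the legacy default stream.
void SortScores(xgboost::Context const *ctx) {
  dh::device_vector<float> scores(1024);         // keys, already on device
  dh::device_vector<std::uint32_t> order(1024);  // output permutation
  xgboost::common::ArgSort<true>(ctx, dh::ToSpan(scores), dh::ToSpan(order));
}
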
@ -176,10 +176,10 @@ inline void AssertNCCLSupport() {
#endif  // !defined(XGBOOST_USE_NCCL)
}

inline void AssertOneAPISupport() {
#ifndef XGBOOST_USE_ONEAPI
  LOG(FATAL) << "XGBoost version not compiled with OneAPI support.";
#endif  // XGBOOST_USE_ONEAPI
inline void AssertSYCLSupport() {
#ifndef XGBOOST_USE_SYCL
  LOG(FATAL) << "XGBoost version not compiled with SYCL support.";
#endif  // XGBOOST_USE_SYCL
}
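The OneAPI guard is renamed here to match the SYCL plugin terminology used elsewhere in this merge. A hedged sketch of the intended call pattern follows; the dispatcher function is illustrative, and the namespace is assumed from the neighbouring Assert helpers.

// Hypothetical dispatcher, not from this diff. AssertSYCLSupport() is a
// no-op when the library is built with the SYCL plugin (XGBOOST_USE_SYCL
// defined) and raises a fatal error otherwise, so dispatch fails loudly.
void DispatchSyclUpdater() {
  xgboost::common::AssertSYCLSupport();
  // ... hand off to the SYCL implementation here ...
}
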
void SetDevice(std::int32_t device);

@ -38,10 +38,6 @@
#include "xgboost/logging.h"
#include "xgboost/span.h"

#ifdef XGBOOST_USE_NCCL
#include "nccl.h"
#endif  // XGBOOST_USE_NCCL

#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
#include "rmm/mr/device/per_device_resource.hpp"
#include "rmm/mr/device/thrust_allocator_adaptor.hpp"
@ -117,30 +113,6 @@ XGBOOST_DEV_INLINE T atomicAdd(T *addr, T v) {  // NOLINT
}
namespace dh {

#ifdef XGBOOST_USE_NCCL
#define safe_nccl(ans) ThrowOnNcclError((ans), __FILE__, __LINE__)

inline ncclResult_t ThrowOnNcclError(ncclResult_t code, const char *file, int line) {
  if (code != ncclSuccess) {
    std::stringstream ss;
    ss << "NCCL failure: " << ncclGetErrorString(code) << ".";
    ss << " " << file << "(" << line << ")\n";
    if (code == ncclUnhandledCudaError) {
      // nccl usually preserves the last error so we can get more details.
      auto err = cudaPeekAtLastError();
      ss << " CUDA error: " << thrust::system_error(err, thrust::cuda_category()).what() << "\n";
    } else if (code == ncclSystemError) {
      ss << " This might be caused by a network configuration issue. Please consider specifying "
         "the network interface for NCCL via environment variables listed in its reference: "
         "`https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html`.\n";
    }
    LOG(FATAL) << ss.str();
  }

  return code;
}
#endif

inline int32_t CudaGetPointerDevice(void const *ptr) {
  int32_t device = -1;
  cudaPointerAttributes attr;
@ -315,8 +287,8 @@ inline void LaunchN(size_t n, L lambda) {
}

template <typename Container>
void Iota(Container array) {
  LaunchN(array.size(), [=] __device__(size_t i) { array[i] = i; });
void Iota(Container array, cudaStream_t stream) {
  LaunchN(array.size(), stream, [=] __device__(size_t i) { array[i] = i; });
}

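With the stream parameter now explicit, Iota callers pass the Context's stream, as the updated call sites in algorithm.cuh above do. A short sketch; the wrapper function is illustrative, and dh::ToSpan and Context::CUDACtx are used exactly as in this diff.

// Sketch: fill an index buffer on the Context's CUDA stream rather than the
// default stream, so the kernel is ordered with the booster's other work.
void FillIndices(xgboost::Context const *ctx, dh::device_vector<std::size_t> *p_idx) {
  auto cuctx = ctx->CUDACtx();
  dh::Iota(dh::ToSpan(*p_idx), cuctx->Stream());
}
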
namespace detail {
@ -482,7 +454,8 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
cub::CachingDeviceAllocator& GetGlobalCachingAllocator() {
  // Configure allocator with maximum cached bin size of ~1GB and no limit on
  // maximum cached bytes
  thread_local cub::CachingDeviceAllocator *allocator = new cub::CachingDeviceAllocator(2, 9, 29);
  thread_local std::unique_ptr<cub::CachingDeviceAllocator> allocator{
      std::make_unique<cub::CachingDeviceAllocator>(2, 9, 29)};
  return *allocator;
}
pointer allocate(size_t n) {  // NOLINT
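Replacing the owning raw pointer with a thread_local std::unique_ptr means each thread's cached allocator is destroyed at thread exit instead of leaking. A standalone illustration of the pattern (generic C++, not XGBoost code):

#include <memory>

struct Cache { /* expensive per-thread state */ };

Cache &GetThreadCache() {
  // The unique_ptr's destructor runs at thread exit; the raw `new` version
  // had no matching delete.
  thread_local std::unique_ptr<Cache> cache{std::make_unique<Cache>()};
  return *cache;
}
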
@ -598,6 +571,16 @@ class DoubleBuffer {
  T *Other() { return buff.Alternate(); }
};

template <typename T>
xgboost::common::Span<T> LazyResize(xgboost::Context const *ctx,
                                    xgboost::HostDeviceVector<T> *buffer, std::size_t n) {
  buffer->SetDevice(ctx->Device());
  if (buffer->Size() < n) {
    buffer->Resize(n);
  }
  return buffer->DeviceSpan().subspan(0, n);
}

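The new LazyResize helper grows a long-lived HostDeviceVector only when the request exceeds its current size, then returns a span over exactly n elements, avoiding repeated reallocation across iterations. A hedged usage sketch; the wrapper and buffer names are illustrative:

// Sketch: reuse a persistent scratch buffer across boosting iterations.
void UseScratch(xgboost::Context const *ctx,
                xgboost::HostDeviceVector<float> *scratch,  // long-lived cache
                std::size_t n_needed) {
  auto span = dh::LazyResize(ctx, scratch, n_needed);
  // span.size() == n_needed; the underlying allocation only ever grows.
}
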
/**
 * \brief Copies device span to std::vector.
 *
@ -1061,74 +1044,6 @@ void InclusiveSum(InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items)
  InclusiveScan(d_in, d_out, cub::Sum(), num_items);
}

template <bool accending, typename IdxT, typename U>
void ArgSort(xgboost::common::Span<U> keys, xgboost::common::Span<IdxT> sorted_idx) {
  size_t bytes = 0;
  Iota(sorted_idx);

  using KeyT = typename decltype(keys)::value_type;
  using ValueT = std::remove_const_t<IdxT>;

  TemporaryArray<KeyT> out(keys.size());
  cub::DoubleBuffer<KeyT> d_keys(const_cast<KeyT *>(keys.data()),
                                 out.data().get());
  TemporaryArray<IdxT> sorted_idx_out(sorted_idx.size());
  cub::DoubleBuffer<ValueT> d_values(const_cast<ValueT *>(sorted_idx.data()),
                                     sorted_idx_out.data().get());

  // track https://github.com/NVIDIA/cub/pull/340 for 64bit length support
  using OffsetT = std::conditional_t<!BuildWithCUDACub(), std::ptrdiff_t, int32_t>;
  CHECK_LE(sorted_idx.size(), std::numeric_limits<OffsetT>::max());
  if (accending) {
    void *d_temp_storage = nullptr;
#if THRUST_MAJOR_VERSION >= 2
    safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
        d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
        sizeof(KeyT) * 8, false, nullptr)));
#else
    safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
        d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
        sizeof(KeyT) * 8, false, nullptr, false)));
#endif
    TemporaryArray<char> storage(bytes);
    d_temp_storage = storage.data().get();
#if THRUST_MAJOR_VERSION >= 2
    safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
        d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
        sizeof(KeyT) * 8, false, nullptr)));
#else
    safe_cuda((cub::DispatchRadixSort<false, KeyT, ValueT, OffsetT>::Dispatch(
        d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
        sizeof(KeyT) * 8, false, nullptr, false)));
#endif
  } else {
    void *d_temp_storage = nullptr;
#if THRUST_MAJOR_VERSION >= 2
    safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
        d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
        sizeof(KeyT) * 8, false, nullptr)));
#else
    safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
        d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
        sizeof(KeyT) * 8, false, nullptr, false)));
#endif
    TemporaryArray<char> storage(bytes);
    d_temp_storage = storage.data().get();
#if THRUST_MAJOR_VERSION >= 2
    safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
        d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
        sizeof(KeyT) * 8, false, nullptr)));
#else
    safe_cuda((cub::DispatchRadixSort<true, KeyT, ValueT, OffsetT>::Dispatch(
        d_temp_storage, bytes, d_keys, d_values, sorted_idx.size(), 0,
        sizeof(KeyT) * 8, false, nullptr, false)));
#endif
  }

  safe_cuda(cudaMemcpyAsync(sorted_idx.data(), sorted_idx_out.data().get(),
                            sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice));
}

class CUDAStreamView;

class CUDAEvent {

@ -97,5 +97,7 @@ constexpr StringView InvalidCUDAOrdinal() {
}

void MismatchedDevices(Context const* booster, Context const* data);

inline auto NoFederated() { return "XGBoost is not compiled with federated learning support."; }
}  // namespace xgboost::error
#endif  // XGBOOST_COMMON_ERROR_MSG_H_

@ -51,7 +51,7 @@ HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins
    for (auto const &page : m->GetBatches<SparsePage>()) {
      container.PushRowPage(page, info, hessian);
    }
    container.MakeCuts(m->Info(), &out);
    container.MakeCuts(ctx, m->Info(), &out);
  } else {
    SortedSketchContainer container{ctx,
                                    max_bins,
@ -61,7 +61,7 @@ HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins
    for (auto const &page : m->GetBatches<SortedCSCPage>(ctx)) {
      container.PushColPage(page, info, hessian);
    }
    container.MakeCuts(m->Info(), &out);
    container.MakeCuts(ctx, m->Info(), &out);
  }

  return out;

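Both branches of SketchOnDMatrix now pass the Context into MakeCuts, so device and collective state flow from a single place. A minimal sketch of the updated entry point; the wrapper is illustrative and assumes the trailing parameters of SketchOnDMatrix keep their defaults:

// Sketch: build histogram cuts with the Context threaded through explicitly.
xgboost::common::HistogramCuts BuildCuts(xgboost::Context const *ctx,
                                         xgboost::DMatrix *dmat) {
  bst_bin_t max_bins = 256;  // illustrative value
  return xgboost::common::SketchOnDMatrix(ctx, dmat, max_bins);
}
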
@ -359,7 +359,7 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b
    }
  }

  sketch_container.MakeCuts(&cuts, p_fmat->Info().IsColumnSplit());
  sketch_container.MakeCuts(ctx, &cuts, p_fmat->Info().IsColumnSplit());
  return cuts;
}
}  // namespace xgboost::common

@ -11,9 +11,7 @@
#include "categorical.h"
#include "hist_util.h"

namespace xgboost {
namespace common {

namespace xgboost::common {
template <typename WQSketch>
SketchContainerImpl<WQSketch>::SketchContainerImpl(Context const *ctx,
                                                   std::vector<bst_row_t> columns_size,
@ -129,7 +127,7 @@ struct QuantileAllreduce {
   * \param rank rank of target worker
   * \param fidx feature idx
   */
  auto Values(int32_t rank, bst_feature_t fidx) const {
  [[nodiscard]] auto Values(int32_t rank, bst_feature_t fidx) const {
    // get span for worker
    auto wsize = worker_indptr[rank + 1] - worker_indptr[rank];
    auto worker_values = global_values.subspan(worker_indptr[rank], wsize);
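The added [[nodiscard]] makes compilers warn when the returned span is silently dropped. A tiny generic illustration of the attribute (not XGBoost code):

[[nodiscard]] int Checksum();

int Caller() {
  Checksum();         // warning: return value of [[nodiscard]] function discarded
  return Checksum();  // fine: the value is consumed
}
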
@ -145,7 +143,7 @@ struct QuantileAllreduce {

template <typename WQSketch>
void SketchContainerImpl<WQSketch>::GatherSketchInfo(
    MetaInfo const& info,
    Context const *, MetaInfo const &info,
    std::vector<typename WQSketch::SummaryContainer> const &reduced,
    std::vector<size_t> *p_worker_segments, std::vector<bst_row_t> *p_sketches_scan,
    std::vector<typename WQSketch::Entry> *p_global_sketches) {
@ -206,7 +204,7 @@ void SketchContainerImpl<WQSketch>::GatherSketchInfo(
}

template <typename WQSketch>
void SketchContainerImpl<WQSketch>::AllreduceCategories(MetaInfo const& info) {
void SketchContainerImpl<WQSketch>::AllreduceCategories(Context const*, MetaInfo const& info) {
  auto world_size = collective::GetWorldSize();
  auto rank = collective::GetRank();
  if (world_size == 1 || info.IsColumnSplit()) {
@ -274,16 +272,15 @@ void SketchContainerImpl<WQSketch>::AllreduceCategories(MetaInfo const& info) {

template <typename WQSketch>
void SketchContainerImpl<WQSketch>::AllReduce(
    MetaInfo const& info,
    std::vector<typename WQSketch::SummaryContainer> *p_reduced,
    std::vector<int32_t>* p_num_cuts) {
    Context const *ctx, MetaInfo const &info,
    std::vector<typename WQSketch::SummaryContainer> *p_reduced, std::vector<int32_t> *p_num_cuts) {
  monitor_.Start(__func__);

  size_t n_columns = sketches_.size();
  collective::Allreduce<collective::Operation::kMax>(&n_columns, 1);
  CHECK_EQ(n_columns, sketches_.size()) << "Number of columns differs across workers";

  AllreduceCategories(info);
  AllreduceCategories(ctx, info);

  auto& num_cuts = *p_num_cuts;
  CHECK_EQ(num_cuts.size(), 0);
@ -324,7 +321,7 @@ void SketchContainerImpl<WQSketch>::AllReduce(
  std::vector<bst_row_t> sketches_scan((n_columns + 1) * world, 0);

  std::vector<typename WQSketch::Entry> global_sketches;
  this->GatherSketchInfo(info, reduced, &worker_segments, &sketches_scan, &global_sketches);
  this->GatherSketchInfo(ctx, info, reduced, &worker_segments, &sketches_scan, &global_sketches);

  std::vector<typename WQSketch::SummaryContainer> final_sketches(n_columns);

@ -383,11 +380,12 @@ auto AddCategories(std::set<float> const &categories, HistogramCuts *cuts) {
}

template <typename WQSketch>
void SketchContainerImpl<WQSketch>::MakeCuts(MetaInfo const &info, HistogramCuts *p_cuts) {
void SketchContainerImpl<WQSketch>::MakeCuts(Context const *ctx, MetaInfo const &info,
                                             HistogramCuts *p_cuts) {
  monitor_.Start(__func__);
  std::vector<typename WQSketch::SummaryContainer> reduced;
  std::vector<int32_t> num_cuts;
  this->AllReduce(info, &reduced, &num_cuts);
  this->AllReduce(ctx, info, &reduced, &num_cuts);

  p_cuts->min_vals_.HostVector().resize(sketches_.size(), 0.0f);
  std::vector<typename WQSketch::SummaryContainer> final_summaries(reduced.size());
@ -496,5 +494,4 @@ void SortedSketchContainer::PushColPage(SparsePage const &page, MetaInfo const &
  });
  monitor_.Stop(__func__);
}
}  // namespace common
}  // namespace xgboost
}  // namespace xgboost::common

@ -22,9 +22,7 @@
#include "transform_iterator.h"  // MakeIndexTransformIter
#include "xgboost/span.h"

namespace xgboost {
namespace common {

namespace xgboost::common {
using WQSketch = HostSketchContainer::WQSketch;
using SketchEntry = WQSketch::Entry;

@ -504,7 +502,7 @@ void SketchContainer::FixError() {
  });
}

void SketchContainer::AllReduce(bool is_column_split) {
void SketchContainer::AllReduce(Context const*, bool is_column_split) {
  dh::safe_cuda(cudaSetDevice(device_.ordinal));
  auto world = collective::GetWorldSize();
  if (world == 1 || is_column_split) {
@ -585,13 +583,13 @@ struct InvalidCatOp {
};
}  // anonymous namespace

void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
void SketchContainer::MakeCuts(Context const* ctx, HistogramCuts* p_cuts, bool is_column_split) {
  timer_.Start(__func__);
  dh::safe_cuda(cudaSetDevice(device_.ordinal));
  p_cuts->min_vals_.Resize(num_columns_);

  // Sync between workers.
  this->AllReduce(is_column_split);
  this->AllReduce(ctx, is_column_split);

  // Prune to final number of bins.
  this->Prune(num_bins_ + 1);
@ -734,5 +732,4 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) {
  p_cuts->SetCategorical(this->has_categorical_, max_cat);
  timer_.Stop(__func__);
}
}  // namespace common
}  // namespace xgboost
}  // namespace xgboost::common

@ -151,9 +151,9 @@ class SketchContainer {
                             Span<SketchEntry const> that);

  /* \brief Merge quantiles from other GPU workers. */
  void AllReduce(bool is_column_split);
  void AllReduce(Context const* ctx, bool is_column_split);
  /* \brief Create the final histogram cut values. */
  void MakeCuts(HistogramCuts* cuts, bool is_column_split);
  void MakeCuts(Context const* ctx, HistogramCuts* cuts, bool is_column_split);

  Span<SketchEntry const> Data() const {
    return {this->Current().data().get(), this->Current().size()};

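With these header changes, the GPU sketcher's entry points mirror the CPU side. A hedged sketch of the updated call sequence; the wrapper is illustrative and container construction is elided:

// Sketch: the Context now reaches both the inter-worker sync and the cut
// finalization of the GPU SketchContainer.
void FinalizeCuts(xgboost::Context const *ctx, xgboost::common::SketchContainer *sketch,
                  xgboost::common::HistogramCuts *cuts, bool is_column_split) {
  sketch->MakeCuts(ctx, cuts, is_column_split);  // internally calls AllReduce(ctx, ...)
}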