diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 0288b0c97..1f91afdc5 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -141,18 +141,14 @@ jobs: architecture: 'x64' - name: Install Python packages run: | - python -m pip install wheel setuptools cpplint pylint + python -m pip install wheel setuptools cmakelint cpplint pylint - name: Run lint run: | - python3 dmlc-core/scripts/lint.py xgboost cpp R-package/src + python3 tests/ci_build/lint_cpp.py xgboost cpp R-package/src - python3 dmlc-core/scripts/lint.py --exclude_path \ - python-package/xgboost/dmlc-core \ - python-package/xgboost/include \ - python-package/xgboost/lib \ - python-package/xgboost/rabit \ - python-package/xgboost/src \ - --pylint-rc python-package/.pylintrc \ - xgboost \ - cpp \ - include src python-package + python3 tests/ci_build/lint_cpp.py xgboost cpp include src python-package \ + --exclude_path python-package/xgboost/dmlc-core python-package/xgboost/include \ + python-package/xgboost/lib python-package/xgboost/rabit \ + python-package/xgboost/src + + sh ./tests/ci_build/lint_cmake.sh || true diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index 532c9277a..e9704c75d 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -190,7 +190,7 @@ jobs: run: | mkdir build_msvc cd build_msvc - cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DBUILD_DEPRECATED_CLI=ON + cmake .. -G"Visual Studio 17 2022" -DCMAKE_CONFIGURATION_TYPES="Release" -A x64 -DBUILD_DEPRECATED_CLI=ON cmake --build . --config Release --parallel $(nproc) - name: Install Python package diff --git a/CMakeLists.txt b/CMakeLists.txt index f617243b6..ba86257dc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,9 +8,9 @@ cmake_policy(SET CMP0076 NEW) set(CMAKE_POLICY_DEFAULT_CMP0063 NEW) cmake_policy(SET CMP0063 NEW) -if ((${CMAKE_VERSION} VERSION_GREATER 3.13) OR (${CMAKE_VERSION} VERSION_EQUAL 3.13)) +if((${CMAKE_VERSION} VERSION_GREATER 3.13) OR (${CMAKE_VERSION} VERSION_EQUAL 3.13)) cmake_policy(SET CMP0077 NEW) -endif ((${CMAKE_VERSION} VERSION_GREATER 3.13) OR (${CMAKE_VERSION} VERSION_EQUAL 3.13)) +endif() message(STATUS "CMake version ${CMAKE_VERSION}") @@ -90,108 +90,99 @@ option(PLUGIN_UPDATER_ONEAPI "DPC++ updater" OFF) option(ADD_PKGCONFIG "Add xgboost.pc into system." 
ON) #-- Checks for building XGBoost -if (USE_DEBUG_OUTPUT AND (NOT (CMAKE_BUILD_TYPE MATCHES Debug))) +if(USE_DEBUG_OUTPUT AND (NOT (CMAKE_BUILD_TYPE MATCHES Debug))) message(SEND_ERROR "Do not enable `USE_DEBUG_OUTPUT' with release build.") -endif (USE_DEBUG_OUTPUT AND (NOT (CMAKE_BUILD_TYPE MATCHES Debug))) - -if (USE_NCCL AND NOT (USE_CUDA)) +endif() +if(USE_NCCL AND NOT (USE_CUDA)) message(SEND_ERROR "`USE_NCCL` must be enabled with `USE_CUDA` flag.") -endif (USE_NCCL AND NOT (USE_CUDA)) -if (USE_DEVICE_DEBUG AND NOT (USE_CUDA)) +endif() +if(USE_DEVICE_DEBUG AND NOT (USE_CUDA)) message(SEND_ERROR "`USE_DEVICE_DEBUG` must be enabled with `USE_CUDA` flag.") -endif (USE_DEVICE_DEBUG AND NOT (USE_CUDA)) -if (BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL)) +endif() +if(BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL)) message(SEND_ERROR "Build XGBoost with -DUSE_NCCL=ON to enable BUILD_WITH_SHARED_NCCL.") -endif (BUILD_WITH_SHARED_NCCL AND (NOT USE_NCCL)) - -if (USE_RCCL AND NOT (USE_HIP)) - message(SEND_ERROR "`USE_RCCL` must be enabled with `USE_HIP` flag.") -endif (USE_RCCL AND NOT (USE_HIP)) -if (USE_DEVICE_DEBUG AND NOT (USE_HIP)) - message(SEND_ERROR "`USE_DEVICE_DEBUG` must be enabled with `USE_HIP` flag.") -endif (USE_DEVICE_DEBUG AND NOT (USE_HIP)) -if (BUILD_WITH_SHARED_RCCL AND (NOT USE_RCCL)) +endif() +if(USE_RCCL AND NOT (USE_HIP)) + message(SEND_ERROR "`USE_RCCL` must be enabled with `USE_HIP` flag.") +endif() +if(BUILD_WITH_SHARED_RCCL AND (NOT USE_RCCL)) message(SEND_ERROR "Build XGBoost with -DUSE_RCCL=ON to enable BUILD_WITH_SHARED_RCCL.") -endif (BUILD_WITH_SHARED_RCCL AND (NOT USE_RCCL)) - -if (JVM_BINDINGS AND R_LIB) +endif() +if(JVM_BINDINGS AND R_LIB) message(SEND_ERROR "`R_LIB' is not compatible with `JVM_BINDINGS' as they both have customized configurations.") -endif (JVM_BINDINGS AND R_LIB) -if (R_LIB AND GOOGLE_TEST) - message(WARNING "Some C++ unittests will fail with `R_LIB` enabled, - as R package redirects some functions to R runtime implementation.") -endif (R_LIB AND GOOGLE_TEST) -if (PLUGIN_RMM AND NOT (USE_CUDA)) +endif() +if(R_LIB AND GOOGLE_TEST) + message( + WARNING + "Some C++ tests will fail with `R_LIB` enabled, as R package redirects some functions to R runtime implementation." 
+ ) +endif() +if(PLUGIN_RMM AND NOT (USE_CUDA)) message(SEND_ERROR "`PLUGIN_RMM` must be enabled with `USE_CUDA` flag.") -endif (PLUGIN_RMM AND NOT (USE_CUDA)) - -if (PLUGIN_RMM AND NOT (USE_HIP)) - message(SEND_ERROR "`PLUGIN_RMM` must be enabled with `USE_HIP` flag.") -endif (PLUGIN_RMM AND NOT (USE_HIP)) - -if (PLUGIN_RMM AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))) +endif() +if(PLUGIN_RMM AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))) message(SEND_ERROR "`PLUGIN_RMM` must be used with GCC or Clang compiler.") -endif (PLUGIN_RMM AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))) -if (PLUGIN_RMM AND NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux")) +endif() +if(PLUGIN_RMM AND NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux")) message(SEND_ERROR "`PLUGIN_RMM` must be used with Linux.") -endif (PLUGIN_RMM AND NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux")) -if (ENABLE_ALL_WARNINGS) - if ((NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") AND (NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU")) +endif() +if(ENABLE_ALL_WARNINGS) + if((NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") AND (NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU")) message(SEND_ERROR "ENABLE_ALL_WARNINGS is only available for Clang and GCC.") - endif ((NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") AND (NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU")) -endif (ENABLE_ALL_WARNINGS) -if (BUILD_STATIC_LIB AND (R_LIB OR JVM_BINDINGS)) + endif() +endif() +if(BUILD_STATIC_LIB AND (R_LIB OR JVM_BINDINGS)) message(SEND_ERROR "Cannot build a static library libxgboost.a when R or JVM packages are enabled.") -endif (BUILD_STATIC_LIB AND (R_LIB OR JVM_BINDINGS)) -if (PLUGIN_FEDERATED) - if (CMAKE_CROSSCOMPILING) +endif() +if(PLUGIN_FEDERATED) + if(CMAKE_CROSSCOMPILING) message(SEND_ERROR "Cannot cross compile with federated learning support") - endif () - if (BUILD_STATIC_LIB) + endif() + if(BUILD_STATIC_LIB) message(SEND_ERROR "Cannot build static lib with federated learning support") - endif () - if (R_LIB OR JVM_BINDINGS) + endif() + if(R_LIB OR JVM_BINDINGS) message(SEND_ERROR "Cannot enable federated learning support when R or JVM packages are enabled.") - endif () - if (WIN32) + endif() + if(WIN32) message(SEND_ERROR "Federated learning not supported for Windows platform") - endif () -endif () + endif() +endif() #-- Removed options -if (USE_AVX) +if(USE_AVX) message(SEND_ERROR "The option `USE_AVX` is deprecated as experimental AVX features have been removed from XGBoost.") -endif (USE_AVX) -if (PLUGIN_LZ4) +endif() +if(PLUGIN_LZ4) message(SEND_ERROR "The option `PLUGIN_LZ4` is removed from XGBoost.") -endif (PLUGIN_LZ4) -if (RABIT_BUILD_MPI) +endif() +if(RABIT_BUILD_MPI) message(SEND_ERROR "The option `RABIT_BUILD_MPI` has been removed from XGBoost.") -endif (RABIT_BUILD_MPI) -if (USE_S3) +endif() +if(USE_S3) message(SEND_ERROR "The option `USE_S3` has been removed from XGBoost") -endif (USE_S3) -if (USE_AZURE) +endif() +if(USE_AZURE) message(SEND_ERROR "The option `USE_AZURE` has been removed from XGBoost") -endif (USE_AZURE) -if (USE_HDFS) +endif() +if(USE_HDFS) message(SEND_ERROR "The option `USE_HDFS` has been removed from XGBoost") -endif (USE_HDFS) +endif() #-- Sanitizer -if (USE_SANITIZER) +if(USE_SANITIZER) include(cmake/Sanitizer.cmake) enable_sanitizers("${ENABLED_SANITIZERS}") -endif (USE_SANITIZER) +endif() -if (USE_CUDA) +if(USE_CUDA) set(USE_OPENMP ON CACHE BOOL "CUDA requires OpenMP" FORCE) # `export CXX=' is ignored by CMake CUDA. 
set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) message(STATUS "Configured CUDA host compiler: ${CMAKE_CUDA_HOST_COMPILER}") enable_language(CUDA) - if (${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.0) + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.0) message(FATAL_ERROR "CUDA version must be at least 11.0!") endif() set(GEN_CODE "") @@ -199,7 +190,7 @@ if (USE_CUDA) add_subdirectory(${PROJECT_SOURCE_DIR}/gputreeshap) find_package(CUDAToolkit REQUIRED) -endif (USE_CUDA) +endif() if (USE_HIP) set(USE_OPENMP ON CACHE BOOL "HIP requires OpenMP" FORCE) @@ -218,7 +209,7 @@ if (USE_HIP) add_subdirectory(${PROJECT_SOURCE_DIR}/rocgputreeshap) endif (USE_HIP) -if (FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND +if(FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR (CMAKE_CXX_COMPILER_ID STREQUAL "Clang"))) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdiagnostics-color=always") @@ -226,10 +217,10 @@ endif() find_package(Threads REQUIRED) -if (USE_OPENMP) - if (APPLE) +if(USE_OPENMP) + if(APPLE) find_package(OpenMP) - if (NOT OpenMP_FOUND) + if(NOT OpenMP_FOUND) # Try again with extra path info; required for libomp 15+ from Homebrew execute_process(COMMAND brew --prefix libomp OUTPUT_VARIABLE HOMEBREW_LIBOMP_PREFIX @@ -242,20 +233,20 @@ if (USE_OPENMP) set(OpenMP_CXX_LIB_NAMES omp) set(OpenMP_omp_LIBRARY ${HOMEBREW_LIBOMP_PREFIX}/lib/libomp.dylib) find_package(OpenMP REQUIRED) - endif () - else () + endif() + else() find_package(OpenMP REQUIRED) - endif () -endif (USE_OPENMP) + endif() +endif() #Add for IBM i -if (${CMAKE_SYSTEM_NAME} MATCHES "OS400") +if(${CMAKE_SYSTEM_NAME} MATCHES "OS400") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread") set(CMAKE_CXX_ARCHIVE_CREATE " -X64 qc ") endif() -if (USE_NCCL) +if(USE_NCCL) find_package(Nccl REQUIRED) -endif (USE_NCCL) +endif() if (USE_RCCL) find_package(rccl REQUIRED) @@ -263,17 +254,19 @@ endif (USE_RCCL) # dmlc-core msvc_use_static_runtime() -if (FORCE_SHARED_CRT) +if(FORCE_SHARED_CRT) set(DMLC_FORCE_SHARED_CRT ON) -endif () +endif() add_subdirectory(${xgboost_SOURCE_DIR}/dmlc-core) -if (MSVC) - if (TARGET dmlc_unit_tests) - target_compile_options(dmlc_unit_tests PRIVATE - -D_CRT_SECURE_NO_WARNINGS -D_CRT_SECURE_NO_DEPRECATE) - endif (TARGET dmlc_unit_tests) -endif (MSVC) +if(MSVC) + if(TARGET dmlc_unit_tests) + target_compile_options( + dmlc_unit_tests PRIVATE + -D_CRT_SECURE_NO_WARNINGS -D_CRT_SECURE_NO_DEPRECATE + ) + endif() +endif() # rabit add_subdirectory(rabit) @@ -282,20 +275,25 @@ add_subdirectory(rabit) add_subdirectory(${xgboost_SOURCE_DIR}/src) target_link_libraries(objxgboost PUBLIC dmlc) +# Link -lstdc++fs for GCC 8.x +if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "9.0") + target_link_libraries(objxgboost PUBLIC stdc++fs) +endif() + # Exports some R specific definitions and objects -if (R_LIB) +if(R_LIB) add_subdirectory(${xgboost_SOURCE_DIR}/R-package) -endif (R_LIB) +endif() # This creates its own shared library `xgboost4j'. 
-if (JVM_BINDINGS) +if(JVM_BINDINGS) add_subdirectory(${xgboost_SOURCE_DIR}/jvm-packages) -endif (JVM_BINDINGS) +endif() # Plugin add_subdirectory(${xgboost_SOURCE_DIR}/plugin) -if (PLUGIN_RMM) +if(PLUGIN_RMM) find_package(rmm REQUIRED) # Patch the rmm targets so they reference the static cudart @@ -306,14 +304,14 @@ if (PLUGIN_RMM) list(APPEND rmm_link_libs CUDA::cudart_static) set_target_properties(rmm::rmm PROPERTIES INTERFACE_LINK_LIBRARIES "${rmm_link_libs}") get_target_property(rmm_link_libs rmm::rmm INTERFACE_LINK_LIBRARIES) -endif (PLUGIN_RMM) +endif() #-- library -if (BUILD_STATIC_LIB) +if(BUILD_STATIC_LIB) add_library(xgboost STATIC) -else (BUILD_STATIC_LIB) +else() add_library(xgboost SHARED) -endif (BUILD_STATIC_LIB) +endif() target_link_libraries(xgboost PRIVATE objxgboost) target_include_directories(xgboost INTERFACE @@ -322,7 +320,7 @@ target_include_directories(xgboost #-- End shared library #-- CLI for xgboost -if (BUILD_DEPRECATED_CLI) +if(BUILD_DEPRECATED_CLI) add_executable(runxgboost ${xgboost_SOURCE_DIR}/src/cli_main.cc) target_link_libraries(runxgboost PRIVATE objxgboost) target_include_directories(runxgboost @@ -336,12 +334,12 @@ if (BUILD_DEPRECATED_CLI) xgboost_target_link_libraries(runxgboost) xgboost_target_defs(runxgboost) - if (KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR) + if(KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR) set_output_directory(runxgboost ${xgboost_BINARY_DIR}) - else () + else() set_output_directory(runxgboost ${xgboost_SOURCE_DIR}) - endif (KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR) -endif (BUILD_DEPRECATED_CLI) + endif() +endif() #-- End CLI for xgboost # Common setup for all targets @@ -351,41 +349,41 @@ foreach(target xgboost objxgboost dmlc) xgboost_target_defs(${target}) endforeach() -if (JVM_BINDINGS) +if(JVM_BINDINGS) xgboost_target_properties(xgboost4j) xgboost_target_link_libraries(xgboost4j) xgboost_target_defs(xgboost4j) -endif (JVM_BINDINGS) +endif() -if (KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR) +if(KEEP_BUILD_ARTIFACTS_IN_BINARY_DIR) set_output_directory(xgboost ${xgboost_BINARY_DIR}/lib) -else () +else() set_output_directory(xgboost ${xgboost_SOURCE_DIR}/lib) -endif () +endif() # Ensure these two targets do not build simultaneously, as they produce outputs with conflicting names -if (BUILD_DEPRECATED_CLI) +if(BUILD_DEPRECATED_CLI) add_dependencies(xgboost runxgboost) -endif (BUILD_DEPRECATED_CLI) +endif() #-- Installing XGBoost -if (R_LIB) +if(R_LIB) include(cmake/RPackageInstallTargetSetup.cmake) set_target_properties(xgboost PROPERTIES PREFIX "") - if (APPLE) + if(APPLE) set_target_properties(xgboost PROPERTIES SUFFIX ".so") - endif (APPLE) + endif() setup_rpackage_install_target(xgboost "${CMAKE_CURRENT_BINARY_DIR}/R-package-install") set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/dummy_inst") -endif (R_LIB) -if (MINGW) +endif() +if(MINGW) set_target_properties(xgboost PROPERTIES PREFIX "") -endif (MINGW) +endif() -if (BUILD_C_DOC) +if(BUILD_C_DOC) include(cmake/Doc.cmake) run_doxygen() -endif (BUILD_C_DOC) +endif() include(CPack) @@ -401,19 +399,19 @@ install(DIRECTORY ${xgboost_SOURCE_DIR}/include/xgboost # > in any export set. 
# # https://github.com/dmlc/xgboost/issues/6085 -if (BUILD_STATIC_LIB) - if (BUILD_DEPRECATED_CLI) +if(BUILD_STATIC_LIB) + if(BUILD_DEPRECATED_CLI) set(INSTALL_TARGETS xgboost runxgboost objxgboost dmlc) else() set(INSTALL_TARGETS xgboost objxgboost dmlc) - endif (BUILD_DEPRECATED_CLI) -else (BUILD_STATIC_LIB) - if (BUILD_DEPRECATED_CLI) + endif() +else() + if(BUILD_DEPRECATED_CLI) set(INSTALL_TARGETS xgboost runxgboost) - else(BUILD_DEPRECATED_CLI) + else() set(INSTALL_TARGETS xgboost) - endif (BUILD_DEPRECATED_CLI) -endif (BUILD_STATIC_LIB) + endif() +endif() install(TARGETS ${INSTALL_TARGETS} EXPORT XGBoostTargets @@ -442,7 +440,7 @@ install( DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/xgboost) #-- Test -if (GOOGLE_TEST) +if(GOOGLE_TEST) enable_testing() # Unittests. add_executable(testxgboost) @@ -462,7 +460,7 @@ if (GOOGLE_TEST) ${xgboost_SOURCE_DIR}/tests/cli/machine.conf.in ${xgboost_BINARY_DIR}/tests/cli/machine.conf @ONLY) - if (BUILD_DEPRECATED_CLI) + if(BUILD_DEPRECATED_CLI) add_test( NAME TestXGBoostCLI COMMAND runxgboost ${xgboost_BINARY_DIR}/tests/cli/machine.conf @@ -470,8 +468,8 @@ if (GOOGLE_TEST) set_tests_properties(TestXGBoostCLI PROPERTIES PASS_REGULAR_EXPRESSION ".*test-rmse:0.087.*") - endif (BUILD_DEPRECATED_CLI) -endif (GOOGLE_TEST) + endif() +endif() # For MSVC: Call msvc_use_static_runtime() once again to completely # replace /MD with /MT. See https://github.com/dmlc/xgboost/issues/4462 @@ -479,10 +477,10 @@ endif (GOOGLE_TEST) msvc_use_static_runtime() # Add xgboost.pc -if (ADD_PKGCONFIG) +if(ADD_PKGCONFIG) configure_file(${xgboost_SOURCE_DIR}/cmake/xgboost.pc.in ${xgboost_BINARY_DIR}/xgboost.pc @ONLY) install( FILES ${xgboost_BINARY_DIR}/xgboost.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) -endif (ADD_PKGCONFIG) +endif() diff --git a/NEWS.md b/NEWS.md index 2a1000e55..43019d877 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,6 +3,207 @@ XGBoost Change Log This file records the changes in xgboost library in reverse chronological order. +## 2.0.0 (2023 Aug 16) + +We are excited to announce the release of XGBoost 2.0. This note will begin by covering some overall changes and then highlight specific updates to the package. + +### Initial work on multi-target trees with vector-leaf outputs +We have been working on vector-leaf tree models for multi-target regression, multi-label classification, and multi-class classification in version 2.0. Previously, XGBoost would build a separate model for each target. However, with this new feature that's still being developed, XGBoost can build one tree for all targets. The feature has multiple benefits and trade-offs compared to the existing approach. It can help prevent overfitting, produce smaller models, and build trees that consider the correlation between targets. In addition, users can combine vector leaf and scalar leaf trees during a training session using a callback. Please note that the feature is still a work in progress, and many parts are not yet available. See #9043 for the current status. Related PRs: (#8538, #8697, #8902, #8884, #8895, #8898, #8612, #8652, #8698, #8908, #8928, #8968, #8616, #8922, #8890, #8872, #8889, #9509) Please note that only the `hist` (default) tree method on CPU can be used for building vector leaf trees at the moment. + +### New `device` parameter + +A new `device` parameter replaces the existing `gpu_id`, `gpu_hist`, `gpu_predictor`, `cpu_predictor`, `gpu_coord_descent`, and the PySpark-specific parameter `use_gpu`. Going forward, users need only the `device` parameter to select which device to run on, along with the ordinal of the device. For more information, please see our document page (https://xgboost.readthedocs.io/en/stable/parameter.html#general-parameters). For example, with `device="cuda", tree_method="hist"`, XGBoost will run the `hist` tree method on GPU. (#9363, #8528, #8604, #9354, #9274, #9243, #8896, #9129, #9362, #9402, #9385, #9398, #9390, #9386, #9412, #9507, #9536). The old behavior of `gpu_hist` is preserved but deprecated. In addition, the `predictor` parameter is removed.
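+
+A minimal sketch of the change, for illustration only (the parameter values here are examples, not recommendations):
+
+```python
+import xgboost as xgb
+
+# XGBoost 2.0: one parameter selects the device (optionally with an ordinal).
+clf = xgb.XGBClassifier(tree_method="hist", device="cuda")  # or device="cuda:0"
+
+# Pre-2.0 equivalent, now deprecated:
+# clf = xgb.XGBClassifier(tree_method="gpu_hist", gpu_id=0)
+```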
+ +### `hist` is now the default tree method +Starting from 2.0, the `hist` tree method is the default. In previous versions, XGBoost chose `approx` or `exact` depending on the input data and training environment. The new default can help XGBoost train models more efficiently and consistently. (#9320, #9353) + +### GPU-based approx tree method +There's initial support for using the `approx` tree method on GPU. Its performance is not yet well optimized, but it is feature complete except for the JVM packages. It can be enabled with the parameter combination `device="cuda", tree_method="approx"`. (#9414, #9399, #9478). Please note that the Scala-based Spark interface is not yet supported. + +### Optimize and bound the size of the histogram on CPU to control memory footprint + +XGBoost has a new parameter `max_cached_hist_node` for users to limit the CPU cache size for histograms. It can help prevent XGBoost from caching histograms too aggressively. Without the cache, performance is likely to decrease. However, the size of the cache grows exponentially with the depth of the tree. The limit can be crucial when growing deep trees. In most cases, users need not configure this parameter as it does not affect the model's accuracy. (#9455, #9441, #9440, #9427, #9400). + +Along with the cache limit, XGBoost also reduces the memory usage of the `hist` and `approx` tree methods on distributed systems by cutting the size of the cache by half. (#9433) + +### Improved external memory support +There is some exciting development around external memory support in XGBoost. It's still an experimental feature, but the performance has been significantly improved with the default `hist` tree method. We replaced the old file IO logic with a memory map. In addition to performance, we have reduced CPU memory usage and added extensive documentation. Beginning from 2.0.0, we encourage users to try it with the `hist` tree method when the memory savings from `QuantileDMatrix` are not sufficient. (#9361, #9317, #9282, #9315, #8457) + +### Learning to rank +We created a brand-new implementation for the learning-to-rank task. With the latest version, XGBoost gained a set of new features for the ranking task, including: + +- A new parameter `lambdarank_pair_method` for choosing the pair construction strategy (see the sketch after this list). +- A new parameter `lambdarank_num_pair_per_sample` for controlling the number of samples for each group. +- An experimental implementation of unbiased learning-to-rank, which can be accessed using the `lambdarank_unbiased` parameter. +- Support for custom gain function with `NDCG` using the `ndcg_exp_gain` parameter. +- Deterministic GPU computation for all objectives and metrics. +- `NDCG` is now the default objective function. +- Improved performance of metrics using caches. +- Support scikit-learn utilities for `XGBRanker`. +- Extensive documentation on how learning-to-rank works with XGBoost.
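+
+As an illustration of these parameters, a minimal sketch through the scikit-learn interface; the data is synthetic and the parameter values are examples, not recommendations:
+
+```python
+import numpy as np
+import xgboost as xgb
+
+rng = np.random.default_rng(0)
+X = rng.normal(size=(100, 5))                 # 100 documents, 5 features
+y = rng.integers(0, 4, size=100)              # graded relevance labels
+qid = np.sort(rng.integers(0, 10, size=100))  # query ids, sorted by group
+
+ranker = xgb.XGBRanker(
+    n_estimators=10,
+    objective="rank:ndcg",             # NDCG is now the default objective
+    lambdarank_pair_method="topk",     # pair construction strategy
+    lambdarank_num_pair_per_sample=8,  # pairs constructed per sample
+)
+ranker.fit(X, y, qid=qid)
+scores = ranker.predict(X)
+```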
+ +For more information, please see the [tutorial](https://xgboost.readthedocs.io/en/latest/tutorials/learning_to_rank.html). Related PRs: (#8771, #8692, #8783, #8789, #8790, #8859, #8887, #8893, #8906, #8931, #9075, #9015, #9381, #9336, #8822, #9222, #8984, #8785, #8786, #8768) + +### Automatically estimated intercept + +In the previous version, `base_score` was a constant that could be set as a training parameter. In the new version, XGBoost can automatically estimate this parameter based on input labels for optimal accuracy. (#8539, #8498, #8272, #8793, #8607) + +### Quantile regression +The XGBoost algorithm now supports quantile regression, which involves minimizing the quantile loss (also called "pinball loss"). Furthermore, XGBoost allows for training with multiple target quantiles simultaneously, with one tree per quantile. (#8775, #8761, #8760, #8758, #8750) + +### L1 and quantile regression now support learning rate +Both objectives use adaptive trees due to the lack of proper Hessian values. In the new version, XGBoost can scale the leaf value with the learning rate accordingly. (#8866) + +### Export cut value + +Using the Python or the C package, users can export the quantile values (not to be confused with quantile regression) used for the `hist` tree method. (#9356) + +### Column-based split and federated learning +We made progress on column-based split for federated learning. In 2.0, the `approx`, `hist`, and `hist` with vector leaf tree methods can all work with column-based data splits, along with support for vertical federated learning. Work on GPU support is still ongoing; stay tuned. (#8576, #8468, #8442, #8847, #8811, #8985, #8623, #8568, #8828, #8932, #9081, #9102, #9103, #9124, #9120, #9367, #9370, #9343, #9171, #9346, #9270, #9244, #8494, #8434, #8742, #8804, #8710, #8676, #9020, #9002, #9058, #9037, #9018, #9295, #9006, #9300, #8765, #9365, #9060) + +### PySpark +After the initial introduction of the PySpark interface, it has gained some new features and optimizations in 2.0. + +- GPU-based prediction. (#9292, #9542) +- Optimization for data initialization by avoiding the stack operation. (#9088) +- Support for predicting feature contributions. (#8633) +- Python typing support. (#9156, #9172, #9079, #8375) +- `use_gpu` is deprecated. The `device` parameter is preferred. +- Update eval_metric validation to support lists of strings. (#8826) +- Improved logs for training. (#9449) +- Maintenance, including refactoring and document updates. (#8324, #8465, #8605, #9202, #9460, #9302, #8385, #8630, #8525, #8496) +- Fix for GPU setup. (#9495) + +### Other General New Features +Here's a list of new features that don't have their own section and yet are general to all language bindings. + +- Use the array interface for CSC matrices. This helps XGBoost use a consistent number of threads and aligns the interface of the CSC matrix with other interfaces. In addition, memory usage is likely to decrease with CSC input thanks to on-the-fly type conversion. (#8672) +- CUDA compute 90 is now part of the default build. (#9397) + +### Other General Optimization +These optimizations are general to all language bindings. For language-specific optimizations, please visit the corresponding sections. + +- Performance for input with `array_interface` on CPU (like `numpy`) is significantly improved. (#9090) +- Some optimizations with CUDA for data initialization. (#9199, #9209, #9144) +- Use the latest thrust policy to prevent synchronizing GPU devices.
(#9212) +- XGBoost now uses a per-thread CUDA stream, which prevents synchronization with other streams. (#9416, #9396, #9413) + +### Notable breaking change + +Other than the aforementioned change with the `device` parameter, here's a list of breaking changes affecting all packages. + +- Users must specify the format for text input (#9077). However, we suggest using third-party data structures such as `numpy.ndarray` instead of relying on text inputs. See https://github.com/dmlc/xgboost/issues/9472 for more info. + +### Notable bug fixes + +Some noteworthy bug fixes that are not related to specific language bindings are listed in this section. + +- Some language environments use a different thread to perform garbage collection, which breaks the thread-local cache used in XGBoost. XGBoost 2.0 implements a new thread-safe cache using a lightweight lock to replace the thread-local cache. (#8851) +- Fix model IO by clearing the prediction cache. (#8904) +- `inf` is checked during data construction. (#8911) +- Preserve order of saved updaters configuration. Usually, this is not an issue unless the `updater` parameter is used instead of the `tree_method` parameter. (#9355) +- Fix GPU memory allocation issue with categorical splits. (#9529) +- Handle escape sequences like `\t\n` in feature names for JSON model dump. (#9474) +- Normalize file paths for model IO and text input. This handles short paths on Windows and paths that contain `~` on Unix (#9463). In addition, all path inputs are required to be encoded in UTF-8. (#9448, #9443) +- Fix integer overflow on H100. (#9380) +- Fix weighted sketching on GPU with categorical features. (#9341) +- Fix metric serialization. The bug might cause some of the metrics to be dropped during evaluation. (#9405) +- Fix compilation errors on MSVC x86 targets. (#8823) +- Pick up the dmlc-core fix for the CSV parser. (#8897) + + +### Documentation +Aside from documents for new features, we have many smaller updates to improve the user experience, from troubleshooting guides to typo fixes. + +- Explain CPU/GPU interop. (#8450) +- Guide to troubleshoot NCCL errors. (#8943, #9206) +- Add a note for rabit port selection. (#8879) +- How to build the docs using conda. (#9276) +- Explain how to obtain reproducible results on distributed systems. (#8903) +- Fixes and small updates to documents and demonstration scripts. (#8626, #8436, #8995, #8907, #8923, #8926, #9358, #9232, #9201, #9469, #9462, #9458, #8543, #8597, #8401, #8784, #9213, #9098, #9008, #9223, #9333, #9434, #9435, #9415, #8773, #8752, #9291, #9549) + +### Python package +* New Features and Improvements +- Support primitive types of pyarrow-backed pandas dataframes. (#8653) +- Warning messages emitted by XGBoost are now emitted using Python warnings. (#9387) +- Users can now format the value printed near the bars on the `plot_importance` plot. (#8540) +- XGBoost has improved half-type support (float16) with pandas, cupy, and cuDF. With GPU input, the handling is through the CUDA `__half` type, and no data copy is made. (#8487, #9207, #8481) +- Support `Series` and Python primitive types in `inplace_predict` and `QuantileDMatrix`. (#8547, #8542) +- Support all pandas nullable integer types. (#8480) +- Custom metrics with the scikit-learn interface now support `sample_weight` (see the sketch after this list). (#8706) +- Enable installation of the Python package with system lib in a virtual environment. (#9349) +- Raise an error if the expected workers are not alive in `xgboost.dask.train`. (#9421)
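+
+To illustrate the `sample_weight` support, a minimal sketch through the scikit-learn interface; the metric and data are invented for the example, and the sketch assumes the weights given for the evaluation set are forwarded to the callable via the scikit-learn `sample_weight` keyword convention:
+
+```python
+import numpy as np
+import xgboost as xgb
+
+# A scikit-learn style metric; when weights are provided, XGBoost can
+# pass them to the callable through the `sample_weight` keyword.
+def weighted_mae(y_true, y_pred, sample_weight=None):
+    return float(np.average(np.abs(y_true - y_pred), weights=sample_weight))
+
+rng = np.random.default_rng(0)
+X, y = rng.normal(size=(64, 4)), rng.normal(size=64)
+w = rng.uniform(0.5, 2.0, size=64)
+
+reg = xgb.XGBRegressor(n_estimators=4, eval_metric=weighted_mae)
+reg.fit(X, y, sample_weight=w, eval_set=[(X, y)], sample_weight_eval_set=[w])
+```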
+ +* Optimization +- Cache transformed data in `QuantileDMatrix` for efficiency. (#8666, #9445) +- Take datatable as row-major input. (#8472) +- Remove unnecessary conversions between data structures. (#8546) + +* Adopt modern Python packaging conventions (PEP 517, PEP 518, PEP 621) +- XGBoost adopted the modern Python packaging conventions. The old setup script `setup.py` is now replaced with the new configuration file `pyproject.toml`. Along with this, XGBoost now supports Python 3.11. (#9021, #9112, #9114, #9115) Consult the latest documentation for the updated instructions to build and install XGBoost. + +* Fixes +- `DataIter` now accepts only keyword arguments. (#9431) +- Fix empty DMatrix with categorical features. (#8739) +- Convert `DaskXGBClassifier.classes_` to an array. (#8452) +- Define `best_iteration` only if early stopping is used, to be consistent with the documented behavior. (#9403) +- Make feature validation immutable. (#9388) + +* Breaking changes +- As discussed in the new `device` parameter section, the `predictor` parameter is now removed. (#9129) +- Remove support for single-string feature info. Feature types and names should be a sequence of strings. (#9401) +- Remove parameters in the `save_model` call for the scikit-learn interface. (#8963) +- Remove `ntree_limit` in the Python package. It was deprecated in previous versions. (#8345) + +* Maintenance, including formatting and refactoring, along with type hints +- More consistent use of `black` and `isort` for code formatting. (#8420, #8748, #8867) +- Improved type support. Most of the type changes happen in the PySpark module; here, we list the remaining changes. (#8444, #8617, #9197, #9005) +- Set `enable_categorical` to True in predict. (#8592) +- Some refactoring and updates for tests. (#8395, #8372, #8557, #8379, #8702, #9459, #9316, #8446, #8695, #8409, #8993, #9480) + +* Documentation +- Add introduction and notes for the sklearn interface. (#8948) +- Demo for using dask for hyper-parameter optimization. (#8891) +- Document all supported Python input types. (#8643) +- Other documentation updates. (#8944, #9304) + +### R package +- Use the new data consumption interface for CSR and CSC. This provides better control over the number of threads and improves performance. (#8455, #8673) +- Accept multiple evaluation metrics during training. (#8657) +- Fix integer inputs with `NA`. (#9522) +- Some refactoring for the R package (#8545, #8430, #8614, #8624, #8613, #9457, #8689, #8563, #9461, #8647, #8564, #8565, #8736, #8610, #8609, #8599, #8704, #9456, #9450, #9476, #9477, #9481). Special thanks to @jameslamb. +- Document updates. (#8886, #9323, #9437, #8998) + +### JVM packages +The following are changes specific to various JVM-based packages. + +- Stop using Rabit in prediction. (#9054) +- Set feature_names and feature_types in jvm-packages. This is to prepare support for categorical features. (#9364) +- Scala 2.13 support. (#9099) +- Change training stage from `ResultStage` to `ShuffleMapStage`. (#9423) +- Automatically set the max/min direction for the best score during early stopping. (#9404) +- Revised support for `flink`. (#9046) + +* Breaking changes +- The Scala-based tracker is removed.
(#9078, #9045) +- Change `DeviceQuantileDMatrix` into `QuantileDMatrix`. (#8461) + +* Maintenance (#9253, #9166, #9395, #9389, #9224, #9233, #9351, #9479) + +* CI bot PRs +We employed GitHub's Dependabot to help us keep the dependencies up-to-date for JVM packages. With help from the bot, we have cleared up all the dependencies that were lagging behind (#8501, #8507). + +Here's a list of dependency update PRs, including those made by Dependabot (#8456, #8560, #8571, #8561, #8562, #8600, #8594, #8524, #8509, #8548, #8549, #8533, #8521, #8534, #8532, #8516, #8503, #8531, #8530, #8518, #8512, #8515, #8517, #8506, #8504, #8502, #8629, #8815, #8813, #8814, #8877, #8876, #8875, #8874, #8873, #9049, #9070, #9073, #9039, #9083, #8917, #8952, #8980, #8973, #8962, #9252, #9208, #9131, #9136, #9219, #9160, #9158, #9163, #9184, #9192, #9265, #9268, #8882, #8837, #8662, #8661, #8390, #9056, #8508, #8925, #8920, #9149, #9230, #9097, #8648, #9203, #8593). + +### Maintenance +Maintenance work includes refactoring and fixing small issues that don't affect end users. (#9256, #8627, #8756, #8735, #8966, #8864, #8747, #8892, #9057, #8921, #8949, #8941, #8942, #9108, #9125, #9155, #9153, #9176, #9447, #9444, #9436, #9438, #9430, #9200, #9210, #9055, #9014, #9004, #8999, #9154, #9148, #9283, #9246, #8888, #8900, #8871, #8861, #8858, #8791, #8807, #8751, #8703, #8696, #8693, #8677, #8686, #8665, #8660, #8386, #8371, #8410, #8578, #8574, #8483, #8443, #8454, #8733) + +### CI +- Build pip wheel with RMM support. (#9383) +- Other CI updates, including updating dependencies and work on the CI infrastructure. (#9464, #9428, #8767, #9394, #9278, #9214, #9234, #9205, #9034, #9104, #8878, #9294, #8625, #8806, #8741, #8707, #8381, #8382, #8388, #8402, #8397, #8445, #8602, #8628, #8583, #8460, #9544) + ## 1.7.6 (2023 Jun 16) This is a patch release for bug fixes. The CRAN package for the R binding is kept at 1.7.5.
diff --git a/R-package/.Rbuildignore b/R-package/.Rbuildignore index b37d627ba..b1932e324 100644 --- a/R-package/.Rbuildignore +++ b/R-package/.Rbuildignore @@ -4,3 +4,5 @@ ^.*\.Rproj$ ^\.Rproj\.user$ README.md +^doc$ +^Meta$ diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R index 7265967b2..54f821a79 100644 --- a/R-package/R/callbacks.R +++ b/R-package/R/callbacks.R @@ -70,7 +70,7 @@ cb.print.evaluation <- function(period = 1, showsd = TRUE) { i == env$begin_iteration || i == env$end_iteration) { stdev <- if (showsd) env$bst_evaluation_err else NULL - msg <- format.eval.string(i, env$bst_evaluation, stdev) + msg <- .format_eval_string(i, env$bst_evaluation, stdev) cat(msg, '\n') } } @@ -380,7 +380,9 @@ cb.early.stop <- function(stopping_rounds, maximize = FALSE, if ((maximize && score > best_score) || (!maximize && score < best_score)) { - best_msg <<- format.eval.string(i, env$bst_evaluation, env$bst_evaluation_err) + best_msg <<- .format_eval_string( + i, env$bst_evaluation, env$bst_evaluation_err + ) best_score <<- score best_iteration <<- i best_ntreelimit <<- best_iteration * env$num_parallel_tree @@ -555,14 +557,18 @@ cb.cv.predict <- function(save_models = FALSE) { #' #' @examples #' #### Binary classification: -#' # +#' +#' ## Keep the number of threads to 1 for examples +#' nthread <- 1 +#' data.table::setDTthreads(nthread) +#' #' # In the iris dataset, it is hard to linearly separate Versicolor class from the rest #' # without considering the 2nd order interactions: #' x <- model.matrix(Species ~ .^2, iris)[,-1] #' colnames(x) -#' dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"), nthread = 2) +#' dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"), nthread = nthread) #' param <- list(booster = "gblinear", objective = "reg:logistic", eval_metric = "auc", -#' lambda = 0.0003, alpha = 0.0003, nthread = 2) +#' lambda = 0.0003, alpha = 0.0003, nthread = nthread) #' # For 'shotgun', which is a default linear updater, using high eta values may result in #' # unstable behaviour in some datasets. 
With this simple dataset, however, the high learning #' # rate does not break the convergence, but allows us to illustrate the typical pattern of @@ -592,9 +598,9 @@ cb.cv.predict <- function(save_models = FALSE) { #' #' #### Multiclass classification: #' # -#' dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = 1) +#' dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = nthread) #' param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3, -#' lambda = 0.0003, alpha = 0.0003, nthread = 1) +#' lambda = 0.0003, alpha = 0.0003, nthread = nthread) #' # For the default linear updater 'shotgun' it sometimes is helpful #' # to use smaller eta to reduce instability #' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 50, eta = 0.5, @@ -754,7 +760,7 @@ xgb.gblinear.history <- function(model, class_index = NULL) { # # Format the evaluation metric string -format.eval.string <- function(iter, eval_res, eval_err = NULL) { +.format_eval_string <- function(iter, eval_res, eval_err = NULL) { if (length(eval_res) == 0) stop('no evaluation results') enames <- names(eval_res) diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index 5ffbbc31c..37cfc199e 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -21,13 +21,13 @@ xgb.Booster.handle <- function(params, cachelist, modelfile, handle) { ## A memory buffer bst <- xgb.unserialize(modelfile, handle) xgb.parameters(bst) <- params - return (bst) + return(bst) } else if (inherits(modelfile, "xgb.Booster")) { ## A booster object bst <- xgb.Booster.complete(modelfile, saveraw = TRUE) bst <- xgb.unserialize(bst$raw) xgb.parameters(bst) <- params - return (bst) + return(bst) } else { stop("modelfile must be either character filename, or raw booster dump, or xgb.Booster object") } @@ -267,11 +267,16 @@ xgb.Booster.complete <- function(object, saveraw = TRUE) { #' #' data(agaricus.train, package='xgboost') #' data(agaricus.test, package='xgboost') +#' +#' ## Keep the number of threads to 2 for examples +#' nthread <- 2 +#' data.table::setDTthreads(nthread) +#' #' train <- agaricus.train #' test <- agaricus.test #' #' bst <- xgboost(data = train$data, label = train$label, max_depth = 2, -#' eta = 0.5, nthread = 2, nrounds = 5, objective = "binary:logistic") +#' eta = 0.5, nthread = nthread, nrounds = 5, objective = "binary:logistic") #' # use all trees by default #' pred <- predict(bst, test$data) #' # use only the 1st tree @@ -337,8 +342,14 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, ...) 
{ object <- xgb.Booster.complete(object, saveraw = FALSE) - if (!inherits(newdata, "xgb.DMatrix")) - newdata <- xgb.DMatrix(newdata, missing = missing, nthread = NVL(object$params[["nthread"]], -1)) + if (!inherits(newdata, "xgb.DMatrix")) { + config <- jsonlite::fromJSON(xgb.config(object)) + nthread <- strtoi(config$learner$generic_param$nthread) + newdata <- xgb.DMatrix( + newdata, + missing = missing, nthread = NVL(nthread, -1) + ) + } if (!is.null(object[["feature_names"]]) && !is.null(colnames(newdata)) && !identical(object[["feature_names"]], colnames(newdata))) @@ -371,7 +382,7 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA cval[0] <- val return(cval) } - return (val) + return(val) } ## We set strict_shape to TRUE then drop the dimensions conditionally @@ -628,10 +639,15 @@ xgb.attributes <- function(object) { #' #' @examples #' data(agaricus.train, package='xgboost') +#' ## Keep the number of threads to 1 for examples +#' nthread <- 1 +#' data.table::setDTthreads(nthread) #' train <- agaricus.train #' -#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2, -#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") +#' bst <- xgboost( +#' data = train$data, label = train$label, max_depth = 2, +#' eta = 1, nthread = nthread, nrounds = 2, objective = "binary:logistic" +#' ) #' config <- xgb.config(bst) #' #' @rdname xgb.config diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index fc2609416..b01e98637 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -18,7 +18,12 @@ #' #' @examples #' data(agaricus.train, package='xgboost') -#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2)) +#' ## Keep the number of threads to 1 for examples +#' nthread <- 1 +#' data.table::setDTthreads(nthread) +#' dtrain <- with( +#' agaricus.train, xgb.DMatrix(data, label = label, nthread = nthread) +#' ) #' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data') #' dtrain <- xgb.DMatrix('xgb.DMatrix.data') #' if (file.exists('xgb.DMatrix.data')) file.remove('xgb.DMatrix.data') @@ -112,7 +117,7 @@ xgb.get.DMatrix <- function(data, label, missing, weight, nthread) { stop("xgboost: invalid input data") } } - return (dtrain) + return(dtrain) } diff --git a/R-package/R/xgb.load.R b/R-package/R/xgb.load.R index cfbf0b2d8..cbdbdacc3 100644 --- a/R-package/R/xgb.load.R +++ b/R-package/R/xgb.load.R @@ -22,14 +22,23 @@ #' @examples #' data(agaricus.train, package='xgboost') #' data(agaricus.test, package='xgboost') +#' +#' ## Keep the number of threads to 1 for examples +#' nthread <- 1 +#' data.table::setDTthreads(nthread) +#' #' train <- agaricus.train #' test <- agaricus.test -#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2, -#' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") +#' bst <- xgboost( +#' data = train$data, label = train$label, max_depth = 2, eta = 1, +#' nthread = nthread, +#' nrounds = 2, +#' objective = "binary:logistic" +#' ) +#' #' xgb.save(bst, 'xgb.model') #' bst <- xgb.load('xgb.model') #' if (file.exists('xgb.model')) file.remove('xgb.model') -#' pred <- predict(bst, test$data) #' @export xgb.load <- function(modelfile) { if (is.null(modelfile)) diff --git a/R-package/R/xgb.load.raw.R b/R-package/R/xgb.load.raw.R index d531da6c9..b159e9de1 100644 --- a/R-package/R/xgb.load.raw.R +++ b/R-package/R/xgb.load.raw.R @@ -18,6 +18,6 @@ xgb.load.raw <- function(buffer, as_booster = FALSE) { booster <- xgb.Booster.complete(booster, saveraw = 
TRUE) return(booster) } else { - return (handle) + return(handle) } } diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index fa11c50fb..d69169b89 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -46,9 +46,12 @@ #' # Basic use: #' #' data(agaricus.train, package='xgboost') +#' ## Keep the number of threads to 1 for examples +#' nthread <- 1 +#' data.table::setDTthreads(nthread) #' #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2, -#' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") +#' eta = 1, nthread = nthread, nrounds = 2,objective = "binary:logistic") #' #' (dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst)) #' diff --git a/R-package/R/xgb.plot.deepness.R b/R-package/R/xgb.plot.deepness.R index f6230e1ab..f0fe0f134 100644 --- a/R-package/R/xgb.plot.deepness.R +++ b/R-package/R/xgb.plot.deepness.R @@ -45,10 +45,13 @@ #' @examples #' #' data(agaricus.train, package='xgboost') +#' ## Keep the number of threads to 2 for examples +#' nthread <- 2 +#' data.table::setDTthreads(nthread) #' -#' # Change max_depth to a higher number to get a more significant result +#' ## Change max_depth to a higher number to get a more significant result #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 6, -#' eta = 0.1, nthread = 2, nrounds = 50, objective = "binary:logistic", +#' eta = 0.1, nthread = nthread, nrounds = 50, objective = "binary:logistic", #' subsample = 0.5, min_child_weight = 2) #' #' xgb.plot.deepness(bst) diff --git a/R-package/R/xgb.plot.importance.R b/R-package/R/xgb.plot.importance.R index 7104d701f..2c02d5a42 100644 --- a/R-package/R/xgb.plot.importance.R +++ b/R-package/R/xgb.plot.importance.R @@ -45,9 +45,14 @@ #' #' @examples #' data(agaricus.train) +#' ## Keep the number of threads to 2 for examples +#' nthread <- 2 +#' data.table::setDTthreads(nthread) #' -#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 3, -#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") +#' bst <- xgboost( +#' data = agaricus.train$data, label = agaricus.train$label, max_depth = 3, +#' eta = 1, nthread = nthread, nrounds = 2, objective = "binary:logistic" +#' ) #' #' importance_matrix <- xgb.importance(colnames(agaricus.train$data), model = bst) #' diff --git a/R-package/R/xgb.plot.multi.trees.R b/R-package/R/xgb.plot.multi.trees.R index 63c66008d..f4d797a61 100644 --- a/R-package/R/xgb.plot.multi.trees.R +++ b/R-package/R/xgb.plot.multi.trees.R @@ -43,10 +43,15 @@ #' @examples #' #' data(agaricus.train, package='xgboost') +#' ## Keep the number of threads to 2 for examples +#' nthread <- 2 +#' data.table::setDTthreads(nthread) #' -#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15, -#' eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic", -#' min_child_weight = 50, verbose = 0) +#' bst <- xgboost( +#' data = agaricus.train$data, label = agaricus.train$label, max_depth = 15, +#' eta = 1, nthread = nthread, nrounds = 30, objective = "binary:logistic", +#' min_child_weight = 50, verbose = 0 +#' ) #' #' p <- xgb.plot.multi.trees(model = bst, features_keep = 3) #' print(p) diff --git a/R-package/R/xgb.plot.shap.R b/R-package/R/xgb.plot.shap.R index 9efcb66ec..d9afd5546 100644 --- a/R-package/R/xgb.plot.shap.R +++ b/R-package/R/xgb.plot.shap.R @@ -74,9 +74,14 @@ #' data(agaricus.train, package='xgboost') #' data(agaricus.test, 
package='xgboost') #' -#' bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = 50, +#' ## Keep the number of threads to 1 for examples +#' nthread <- 1 +#' data.table::setDTthreads(nthread) +#' nrounds <- 20 +#' +#' bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = nrounds, #' eta = 0.1, max_depth = 3, subsample = .5, -#' method = "hist", objective = "binary:logistic", nthread = 2, verbose = 0) +#' method = "hist", objective = "binary:logistic", nthread = nthread, verbose = 0) #' #' xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none") #' contr <- predict(bst, agaricus.test$data, predcontrib = TRUE) @@ -85,12 +90,11 @@ #' #' # multiclass example - plots for each class separately: #' nclass <- 3 -#' nrounds <- 20 #' x <- as.matrix(iris[, -5]) #' set.seed(123) #' is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values #' mbst <- xgboost(data = x, label = as.numeric(iris$Species) - 1, nrounds = nrounds, -#' max_depth = 2, eta = 0.3, subsample = .5, nthread = 2, +#' max_depth = 2, eta = 0.3, subsample = .5, nthread = nthread, #' objective = "multi:softprob", num_class = nclass, verbose = 0) #' trees0 <- seq(from=0, by=nclass, length.out=nrounds) #' col <- rgb(0, 0, 1, 0.5) diff --git a/R-package/R/xgb.save.R b/R-package/R/xgb.save.R index 14be0f065..ab55bc4a9 100644 --- a/R-package/R/xgb.save.R +++ b/R-package/R/xgb.save.R @@ -25,14 +25,22 @@ #' @examples #' data(agaricus.train, package='xgboost') #' data(agaricus.test, package='xgboost') +#' +#' ## Keep the number of threads to 1 for examples +#' nthread <- 1 +#' data.table::setDTthreads(nthread) +#' #' train <- agaricus.train #' test <- agaricus.test -#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2, -#' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") +#' bst <- xgboost( +#' data = train$data, label = train$label, max_depth = 2, eta = 1, +#' nthread = nthread, +#' nrounds = 2, +#' objective = "binary:logistic" +#' ) #' xgb.save(bst, 'xgb.model') #' bst <- xgb.load('xgb.model') #' if (file.exists('xgb.model')) file.remove('xgb.model') -#' pred <- predict(bst, test$data) #' @export xgb.save <- function(model, fname) { if (typeof(fname) != "character") diff --git a/R-package/R/xgb.save.raw.R b/R-package/R/xgb.save.raw.R index 48fdbca45..cad0fb0e0 100644 --- a/R-package/R/xgb.save.raw.R +++ b/R-package/R/xgb.save.raw.R @@ -16,13 +16,18 @@ #' @examples #' data(agaricus.train, package='xgboost') #' data(agaricus.test, package='xgboost') +#' +#' ## Keep the number of threads to 2 for examples +#' nthread <- 2 +#' data.table::setDTthreads(nthread) +#' #' train <- agaricus.train #' test <- agaricus.test #' bst <- xgboost(data = train$data, label = train$label, max_depth = 2, -#' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic") +#' eta = 1, nthread = nthread, nrounds = 2,objective = "binary:logistic") +#' #' raw <- xgb.save.raw(bst) #' bst <- xgb.load.raw(raw) -#' pred <- predict(bst, test$data) #' #' @export xgb.save.raw <- function(model, raw_format = "deprecated") { diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 7fe64ab34..d93a0643d 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -168,7 +168,8 @@ #' than the \code{xgboost} interface. #' #' Parallelization is automatically enabled if \code{OpenMP} is present. -#' Number of threads can also be manually specified via \code{nthread} parameter. +#' Number of threads can also be manually specified via the \code{nthread} +#' parameter. 
#' #' The evaluation metric is chosen automatically by XGBoost (according to the objective) #' when the \code{eval_metric} parameter is not provided. @@ -237,17 +238,25 @@ #' data(agaricus.train, package='xgboost') #' data(agaricus.test, package='xgboost') #' -#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2)) -#' dtest <- with(agaricus.test, xgb.DMatrix(data, label = label, nthread = 2)) +#' ## Keep the number of threads to 1 for examples +#' nthread <- 1 +#' data.table::setDTthreads(nthread) +#' +#' dtrain <- with( +#' agaricus.train, xgb.DMatrix(data, label = label, nthread = nthread) +#' ) +#' dtest <- with( +#' agaricus.test, xgb.DMatrix(data, label = label, nthread = nthread) +#' ) #' watchlist <- list(train = dtrain, eval = dtest) #' #' ## A simple xgb.train example: -#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = 2, +#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread, #' objective = "binary:logistic", eval_metric = "auc") #' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist) #' -#' -#' ## An xgb.train example where custom objective and evaluation metric are used: +#' ## An xgb.train example where custom objective and evaluation metric are +#' ## used: #' logregobj <- function(preds, dtrain) { #' labels <- getinfo(dtrain, "label") #' preds <- 1/(1 + exp(-preds)) @@ -263,12 +272,12 @@ #' #' # These functions could be used by passing them either: #' # as 'objective' and 'eval_metric' parameters in the params list: -#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = 2, +#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread, #' objective = logregobj, eval_metric = evalerror) #' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist) #' #' # or through the ... arguments: -#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = 2) +#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread) #' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, #' objective = logregobj, eval_metric = evalerror) #' @@ -278,7 +287,7 @@ #' #' #' ## An xgb.train example of using variable learning rates at each iteration: -#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = 2, +#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread, #' objective = "binary:logistic", eval_metric = "auc") #' my_etas <- list(eta = c(0.5, 0.1)) #' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, @@ -290,7 +299,7 @@ #' #' ## An 'xgboost' interface example: #' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, -#' max_depth = 2, eta = 1, nthread = 2, nrounds = 2, +#' max_depth = 2, eta = 1, nthread = nthread, nrounds = 2, #' objective = "binary:logistic") #' pred <- predict(bst, agaricus.test$data) #' diff --git a/R-package/R/xgb.unserialize.R b/R-package/R/xgb.unserialize.R index e666eb055..291d3e7da 100644 --- a/R-package/R/xgb.unserialize.R +++ b/R-package/R/xgb.unserialize.R @@ -37,5 +37,5 @@ xgb.unserialize <- function(buffer, handle = NULL) { } }) class(handle) <- "xgb.Booster.handle" - return (handle) + return(handle) } diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index db4fd67aa..e60ea2de8 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -24,7 +24,7 @@ xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL, early_stopping_rounds = early_stopping_rounds, maximize = maximize, save_period = save_period, save_name = save_name, xgb_model = xgb_model, callbacks = callbacks, ...) 
- return (bst) + return(bst) } #' Training part from Mushroom Data Set diff --git a/R-package/demo/cross_validation.R b/R-package/demo/cross_validation.R index 33c70a3be..cf048c5ed 100644 --- a/R-package/demo/cross_validation.R +++ b/R-package/demo/cross_validation.R @@ -25,7 +25,7 @@ xgb.cv(param, dtrain, nrounds, nfold = 5, # you can also do cross validation with customized loss function # See custom_objective.R ## -print ('running cross validation, with customized loss function') +print('running cross validation, with customized loss function') logregobj <- function(preds, dtrain) { labels <- getinfo(dtrain, "label") diff --git a/R-package/demo/custom_objective.R b/R-package/demo/custom_objective.R index 2d0914ab5..35201332c 100644 --- a/R-package/demo/custom_objective.R +++ b/R-package/demo/custom_objective.R @@ -35,7 +35,7 @@ evalerror <- function(preds, dtrain) { param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0, objective = logregobj, eval_metric = evalerror) -print ('start training with user customized objective') +print('start training with user customized objective') # training with customized objective, we can also do step by step training # simply look at xgboost.py's implementation of train bst <- xgb.train(param, dtrain, num_round, watchlist) @@ -59,7 +59,7 @@ logregobjattr <- function(preds, dtrain) { } param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0, objective = logregobjattr, eval_metric = evalerror) -print ('start training with user customized objective, with additional attributes in DMatrix') +print('start training with user customized objective, with additional attributes in DMatrix') # training with customized objective, we can also do step by step training # simply look at xgboost.py's implementation of train bst <- xgb.train(param, dtrain, num_round, watchlist) diff --git a/R-package/demo/early_stopping.R b/R-package/demo/early_stopping.R index f733dce8d..04da1382f 100644 --- a/R-package/demo/early_stopping.R +++ b/R-package/demo/early_stopping.R @@ -30,7 +30,7 @@ evalerror <- function(preds, dtrain) { err <- as.numeric(sum(labels != (preds > 0))) / length(labels) return(list(metric = "error", value = err)) } -print ('start training with early Stopping setting') +print('start training with early Stopping setting') bst <- xgb.train(param, dtrain, num_round, watchlist, objective = logregobj, eval_metric = evalerror, maximize = FALSE, diff --git a/R-package/man/cb.gblinear.history.Rd b/R-package/man/cb.gblinear.history.Rd index f050fc7f1..2a03c14db 100644 --- a/R-package/man/cb.gblinear.history.Rd +++ b/R-package/man/cb.gblinear.history.Rd @@ -35,14 +35,18 @@ Callback function expects the following values to be set in its calling frame: } \examples{ #### Binary classification: -# + +## Keep the number of threads to 1 for examples +nthread <- 1 +data.table::setDTthreads(nthread) + # In the iris dataset, it is hard to linearly separate Versicolor class from the rest # without considering the 2nd order interactions: x <- model.matrix(Species ~ .^2, iris)[,-1] colnames(x) -dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"), nthread = 2) +dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"), nthread = nthread) param <- list(booster = "gblinear", objective = "reg:logistic", eval_metric = "auc", - lambda = 0.0003, alpha = 0.0003, nthread = 2) + lambda = 0.0003, alpha = 0.0003, nthread = nthread) # For 'shotgun', which is a default linear updater, using high eta values may result in # unstable behaviour 
diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd
index 87f06d451..ee3b370c4 100644
--- a/R-package/man/predict.xgb.Booster.Rd
+++ b/R-package/man/predict.xgb.Booster.Rd
@@ -132,11 +132,16 @@ Note also that converting a matrix to \code{\link{xgb.DMatrix}} uses multiple th
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
+
+## Keep the number of threads to 2 for examples
+nthread <- 2
+data.table::setDTthreads(nthread)
+
 train <- agaricus.train
 test <- agaricus.test

 bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-               eta = 0.5, nthread = 2, nrounds = 5, objective = "binary:logistic")
+               eta = 0.5, nthread = nthread, nrounds = 5, objective = "binary:logistic")
 # use all trees by default
 pred <- predict(bst, test$data)
 # use only the 1st tree
diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd
index 742073fad..59ef0b3be 100644
--- a/R-package/man/xgb.DMatrix.Rd
+++ b/R-package/man/xgb.DMatrix.Rd
@@ -38,7 +38,12 @@ Supported input file formats are either a LIBSVM text file or a binary file that
 }
 \examples{
 data(agaricus.train, package='xgboost')
-dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
+## Keep the number of threads to 1 for examples
+nthread <- 1
+data.table::setDTthreads(nthread)
+dtrain <- with(
+  agaricus.train, xgb.DMatrix(data, label = label, nthread = nthread)
+)
 xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
 dtrain <- xgb.DMatrix('xgb.DMatrix.data')
 if (file.exists('xgb.DMatrix.data')) file.remove('xgb.DMatrix.data')
diff --git a/R-package/man/xgb.config.Rd b/R-package/man/xgb.config.Rd
index a5187c8ea..35545cc77 100644
--- a/R-package/man/xgb.config.Rd
+++ b/R-package/man/xgb.config.Rd
@@ -19,10 +19,15 @@ Accessors for model parameters as JSON string.
 }
 \examples{
 data(agaricus.train, package='xgboost')
+## Keep the number of threads to 1 for examples
+nthread <- 1
+data.table::setDTthreads(nthread)
 train <- agaricus.train

-bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-               eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
+bst <- xgboost(
+  data = train$data, label = train$label, max_depth = 2,
+  eta = 1, nthread = nthread, nrounds = 2, objective = "binary:logistic"
+)

 config <- xgb.config(bst)
 }
diff --git a/R-package/man/xgb.load.Rd b/R-package/man/xgb.load.Rd
index f644bc408..1a406cc21 100644
--- a/R-package/man/xgb.load.Rd
+++ b/R-package/man/xgb.load.Rd
@@ -27,14 +27,23 @@ not \code{xgb.load}.
 \examples{
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
+
+## Keep the number of threads to 1 for examples
+nthread <- 1
+data.table::setDTthreads(nthread)
+
 train <- agaricus.train
 test <- agaricus.test
-bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-               eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
+bst <- xgboost(
+  data = train$data, label = train$label, max_depth = 2, eta = 1,
+  nthread = nthread,
+  nrounds = 2,
+  objective = "binary:logistic"
+)
+
 xgb.save(bst, 'xgb.model')
 bst <- xgb.load('xgb.model')
 if (file.exists('xgb.model')) file.remove('xgb.model')
-pred <- predict(bst, test$data)
 }
 \seealso{
 \code{\link{xgb.save}}, \code{\link{xgb.Booster.complete}}.
diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd
index b89d298b6..5a17f9d90 100644
--- a/R-package/man/xgb.model.dt.tree.Rd
+++ b/R-package/man/xgb.model.dt.tree.Rd
@@ -66,9 +66,12 @@ Parse a boosted tree model text dump into a \code{data.table} structure.
 # Basic use:

 data(agaricus.train, package='xgboost')
+## Keep the number of threads to 1 for examples
+nthread <- 1
+data.table::setDTthreads(nthread)

 bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
-               eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
+               eta = 1, nthread = nthread, nrounds = 2,objective = "binary:logistic")

 (dt <- xgb.model.dt.tree(colnames(agaricus.train$data), bst))
diff --git a/R-package/man/xgb.plot.deepness.Rd b/R-package/man/xgb.plot.deepness.Rd
index 39e291a81..9e23ac130 100644
--- a/R-package/man/xgb.plot.deepness.Rd
+++ b/R-package/man/xgb.plot.deepness.Rd
@@ -61,10 +61,13 @@ This function was inspired by the blog post
 \examples{

 data(agaricus.train, package='xgboost')
+## Keep the number of threads to 2 for examples
+nthread <- 2
+data.table::setDTthreads(nthread)

-# Change max_depth to a higher number to get a more significant result
+## Change max_depth to a higher number to get a more significant result
 bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 6,
-               eta = 0.1, nthread = 2, nrounds = 50, objective = "binary:logistic",
+               eta = 0.1, nthread = nthread, nrounds = 50, objective = "binary:logistic",
                subsample = 0.5, min_child_weight = 2)

 xgb.plot.deepness(bst)
diff --git a/R-package/man/xgb.plot.importance.Rd b/R-package/man/xgb.plot.importance.Rd
index 1ee58b7ad..4dba62afe 100644
--- a/R-package/man/xgb.plot.importance.Rd
+++ b/R-package/man/xgb.plot.importance.Rd
@@ -77,9 +77,14 @@ with bar colors corresponding to different clusters that have somewhat similar i
 }
 \examples{
 data(agaricus.train)
+## Keep the number of threads to 2 for examples
+nthread <- 2
+data.table::setDTthreads(nthread)

-bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
-               eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
+bst <- xgboost(
+  data = agaricus.train$data, label = agaricus.train$label, max_depth = 3,
+  eta = 1, nthread = nthread, nrounds = 2, objective = "binary:logistic"
+)

 importance_matrix <- xgb.importance(colnames(agaricus.train$data), model = bst)
diff --git a/R-package/man/xgb.plot.multi.trees.Rd b/R-package/man/xgb.plot.multi.trees.Rd
index 74c4a0604..4fa526b90 100644
--- a/R-package/man/xgb.plot.multi.trees.Rd
+++ b/R-package/man/xgb.plot.multi.trees.Rd
@@ -63,10 +63,15 @@ This function is inspired by this blog post:
 \examples{

 data(agaricus.train, package='xgboost')
+## Keep the number of threads to 2 for examples
+nthread <- 2
+data.table::setDTthreads(nthread)

-bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
-               eta = 1, nthread = 2, nrounds = 30, objective = "binary:logistic",
-               min_child_weight = 50, verbose = 0)
+bst <- xgboost(
+  data = agaricus.train$data, label = agaricus.train$label, max_depth = 15,
+  eta = 1, nthread = nthread, nrounds = 30, objective = "binary:logistic",
+  min_child_weight = 50, verbose = 0
+)

 p <- xgb.plot.multi.trees(model = bst, features_keep = 3)
 print(p)
diff --git a/R-package/man/xgb.plot.shap.Rd b/R-package/man/xgb.plot.shap.Rd
index a55a551de..6f2d0dfa6 100644
--- a/R-package/man/xgb.plot.shap.Rd
+++ b/R-package/man/xgb.plot.shap.Rd
@@ -124,9 +124,14 @@ a meaningful thing to do.
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')

-bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = 50,
+## Keep the number of threads to 1 for examples
+nthread <- 1
+data.table::setDTthreads(nthread)
+nrounds <- 20
+
+bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = nrounds,
                eta = 0.1, max_depth = 3, subsample = .5,
-               method = "hist", objective = "binary:logistic", nthread = 2, verbose = 0)
+               method = "hist", objective = "binary:logistic", nthread = nthread, verbose = 0)

 xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")
 contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
@@ -135,12 +140,11 @@ xgb.ggplot.shap.summary(agaricus.test$data, contr, model = bst, top_n = 12)  # S

 # multiclass example - plots for each class separately:
 nclass <- 3
-nrounds <- 20
 x <- as.matrix(iris[, -5])
 set.seed(123)
 is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE  # introduce some missing values
 mbst <- xgboost(data = x, label = as.numeric(iris$Species) - 1, nrounds = nrounds,
-                max_depth = 2, eta = 0.3, subsample = .5, nthread = 2,
+                max_depth = 2, eta = 0.3, subsample = .5, nthread = nthread,
                 objective = "multi:softprob", num_class = nclass, verbose = 0)
 trees0 <- seq(from=0, by=nclass, length.out=nrounds)
 col <- rgb(0, 0, 1, 0.5)
diff --git a/R-package/man/xgb.save.Rd b/R-package/man/xgb.save.Rd
index 235fc504c..a7e160a12 100644
--- a/R-package/man/xgb.save.Rd
+++ b/R-package/man/xgb.save.Rd
@@ -31,14 +31,22 @@ releases of XGBoost.
 \examples{
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
+
+## Keep the number of threads to 1 for examples
+nthread <- 1
+data.table::setDTthreads(nthread)
+
 train <- agaricus.train
 test <- agaricus.test
-bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-               eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
+bst <- xgboost(
+  data = train$data, label = train$label, max_depth = 2, eta = 1,
+  nthread = nthread,
+  nrounds = 2,
+  objective = "binary:logistic"
+)
 xgb.save(bst, 'xgb.model')
 bst <- xgb.load('xgb.model')
 if (file.exists('xgb.model')) file.remove('xgb.model')
-pred <- predict(bst, test$data)
 }
 \seealso{
 \code{\link{xgb.load}}, \code{\link{xgb.Booster.complete}}.
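Several hunks in this diff pair xgb.load() with an explicit xgb.parameters()<- assignment before the next predict() call. A hedged sketch of that save/load pattern (the file name and thread count here are illustrative):

    library(xgboost)
    data(agaricus.train, package = "xgboost")
    data(agaricus.test, package = "xgboost")
    bst <- xgboost(
      data = agaricus.train$data, label = agaricus.train$label,
      max_depth = 2, eta = 1, nthread = 1, nrounds = 2,
      objective = "binary:logistic"
    )
    xgb.save(bst, "xgb.model")
    bst2 <- xgb.load("xgb.model")
    ## Mirror what the tests in this diff do: set runtime parameters such as
    ## nthread explicitly on the loaded booster before predicting.
    xgb.parameters(bst2) <- list(nthread = 1)
    pred <- predict(bst2, agaricus.test$data)
    if (file.exists("xgb.model")) file.remove("xgb.model")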
diff --git a/R-package/man/xgb.save.raw.Rd b/R-package/man/xgb.save.raw.Rd
index ad188eb83..c7c93a734 100644
--- a/R-package/man/xgb.save.raw.Rd
+++ b/R-package/man/xgb.save.raw.Rd
@@ -25,12 +25,17 @@ Save xgboost model from xgboost or xgb.train
 \examples{
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
+
+## Keep the number of threads to 2 for examples
+nthread <- 2
+data.table::setDTthreads(nthread)
+
 train <- agaricus.train
 test <- agaricus.test
 bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-               eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
+               eta = 1, nthread = nthread, nrounds = 2,objective = "binary:logistic")
+
 raw <- xgb.save.raw(bst)
 bst <- xgb.load.raw(raw)
-pred <- predict(bst, test$data)
 }
diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd
index d2eeadfd0..105009cf8 100644
--- a/R-package/man/xgb.train.Rd
+++ b/R-package/man/xgb.train.Rd
@@ -250,7 +250,8 @@ customized objective and evaluation metric functions, therefore it is more flexi
 than the \code{xgboost} interface.

 Parallelization is automatically enabled if \code{OpenMP} is present.
-Number of threads can also be manually specified via \code{nthread} parameter.
+Number of threads can also be manually specified via the \code{nthread}
+parameter.

 The evaluation metric is chosen automatically by XGBoost (according to the objective)
 when the \code{eval_metric} parameter is not provided.
@@ -286,17 +287,25 @@ The following callbacks are automatically created when certain parameters are se
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')

-dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2))
-dtest <- with(agaricus.test, xgb.DMatrix(data, label = label, nthread = 2))
+## Keep the number of threads to 1 for examples
+nthread <- 1
+data.table::setDTthreads(nthread)
+
+dtrain <- with(
+  agaricus.train, xgb.DMatrix(data, label = label, nthread = nthread)
+)
+dtest <- with(
+  agaricus.test, xgb.DMatrix(data, label = label, nthread = nthread)
+)
 watchlist <- list(train = dtrain, eval = dtest)

 ## A simple xgb.train example:
-param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = 2,
+param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread,
               objective = "binary:logistic", eval_metric = "auc")
 bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
-
-## An xgb.train example where custom objective and evaluation metric are used:
+## An xgb.train example where custom objective and evaluation metric are
+## used:
 logregobj <- function(preds, dtrain) {
   labels <- getinfo(dtrain, "label")
   preds <- 1/(1 + exp(-preds))
@@ -312,12 +321,12 @@ evalerror <- function(preds, dtrain) {

 # These functions could be used by passing them either:
 # as 'objective' and 'eval_metric' parameters in the params list:
-param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = 2,
+param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread,
               objective = logregobj, eval_metric = evalerror)
 bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)

 # or through the ... arguments:
-param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = 2)
+param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread)
 bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
                  objective = logregobj, eval_metric = evalerror)

@@ -327,7 +336,7 @@ bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,

 ## An xgb.train example of using variable learning rates at each iteration:
-param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = 2,
+param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread,
               objective = "binary:logistic", eval_metric = "auc")
 my_etas <- list(eta = c(0.5, 0.1))
 bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
@@ -339,7 +348,7 @@ bst <- xgb.train(param, dtrain, nrounds = 25, watchlist,

 ## An 'xgboost' interface example:
 bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
-               max_depth = 2, eta = 1, nthread = 2, nrounds = 2,
+               max_depth = 2, eta = 1, nthread = nthread, nrounds = 2,
                objective = "binary:logistic")
 pred <- predict(bst, agaricus.test$data)
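The custom-objective example in the xgb.train.Rd hunk above shows only the head of logregobj in its context lines. For reference, a sketch of the full contract, mirroring the demo files earlier in this diff (the objective receives raw margin scores plus the training DMatrix and returns the per-row gradient and hessian of the loss; the metric returns a name/value pair):

    logregobj <- function(preds, dtrain) {
      labels <- getinfo(dtrain, "label")
      preds <- 1 / (1 + exp(-preds))  # margin -> probability
      grad <- preds - labels          # first derivative of the log loss
      hess <- preds * (1 - preds)     # second derivative of the log loss
      list(grad = grad, hess = hess)
    }
    evalerror <- function(preds, dtrain) {
      labels <- getinfo(dtrain, "label")
      err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
      list(metric = "error", value = err)
    }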
diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in
index f42c94501..37511ec62 100644
--- a/R-package/src/Makevars.in
+++ b/R-package/src/Makevars.in
@@ -62,6 +62,7 @@ OBJECTS= \
   $(PKGROOT)/src/gbm/gbtree_model.o \
   $(PKGROOT)/src/gbm/gblinear.o \
   $(PKGROOT)/src/gbm/gblinear_model.o \
+  $(PKGROOT)/src/data/adapter.o \
   $(PKGROOT)/src/data/simple_dmatrix.o \
   $(PKGROOT)/src/data/data.o \
   $(PKGROOT)/src/data/sparse_page_raw_format.o \
@@ -97,9 +98,15 @@ OBJECTS= \
   $(PKGROOT)/src/context.o \
   $(PKGROOT)/src/logging.o \
   $(PKGROOT)/src/global_config.o \
+  $(PKGROOT)/src/collective/allgather.o \
+  $(PKGROOT)/src/collective/allreduce.o \
+  $(PKGROOT)/src/collective/broadcast.o \
+  $(PKGROOT)/src/collective/comm.o \
+  $(PKGROOT)/src/collective/tracker.o \
   $(PKGROOT)/src/collective/communicator.o \
   $(PKGROOT)/src/collective/in_memory_communicator.o \
   $(PKGROOT)/src/collective/in_memory_handler.o \
+  $(PKGROOT)/src/collective/loop.o \
   $(PKGROOT)/src/collective/socket.o \
   $(PKGROOT)/src/common/charconv.o \
   $(PKGROOT)/src/common/column_matrix.o \
diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win
index 1b620751f..611cff874 100644
--- a/R-package/src/Makevars.win
+++ b/R-package/src/Makevars.win
@@ -62,6 +62,7 @@ OBJECTS= \
   $(PKGROOT)/src/gbm/gbtree_model.o \
   $(PKGROOT)/src/gbm/gblinear.o \
   $(PKGROOT)/src/gbm/gblinear_model.o \
+  $(PKGROOT)/src/data/adapter.o \
   $(PKGROOT)/src/data/simple_dmatrix.o \
   $(PKGROOT)/src/data/data.o \
   $(PKGROOT)/src/data/sparse_page_raw_format.o \
@@ -97,9 +98,15 @@ OBJECTS= \
   $(PKGROOT)/src/context.o \
   $(PKGROOT)/src/logging.o \
   $(PKGROOT)/src/global_config.o \
+  $(PKGROOT)/src/collective/allgather.o \
+  $(PKGROOT)/src/collective/allreduce.o \
+  $(PKGROOT)/src/collective/broadcast.o \
+  $(PKGROOT)/src/collective/comm.o \
+  $(PKGROOT)/src/collective/tracker.o \
   $(PKGROOT)/src/collective/communicator.o \
   $(PKGROOT)/src/collective/in_memory_communicator.o \
   $(PKGROOT)/src/collective/in_memory_handler.o \
+  $(PKGROOT)/src/collective/loop.o \
   $(PKGROOT)/src/collective/socket.o \
   $(PKGROOT)/src/common/charconv.o \
   $(PKGROOT)/src/common/column_matrix.o \
diff --git a/R-package/src/init.c b/R-package/src/init.c
index 09174222e..5c8e179d6 100644
--- a/R-package/src/init.c
+++ b/R-package/src/init.c
@@ -5,7 +5,6 @@
  * and edited to conform to xgboost C linter requirements. For details, see
  * https://cran.r-project.org/doc/manuals/r-release/R-exts.html#Registering-native-routines
  */
-#include
 #include
 #include
 #include
diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc
index 44082f255..2938d4b6e 100644
--- a/R-package/src/xgboost_R.cc
+++ b/R-package/src/xgboost_R.cc
@@ -20,7 +20,6 @@
 #include "../../src/common/threading_utils.h"

 #include "./xgboost_R.h"  // Must follow other includes.
-#include "Rinternals.h"

 /*!
  * \brief macro to annotate begin of api
diff --git a/R-package/tests/helper_scripts/generate_models.R b/R-package/tests/helper_scripts/generate_models.R
index 5d64fa6c5..ef2aeded6 100644
--- a/R-package/tests/helper_scripts/generate_models.R
+++ b/R-package/tests/helper_scripts/generate_models.R
@@ -19,15 +19,15 @@ w <- runif(metadata$kRows)
 version <- packageVersion('xgboost')
 target_dir <- 'models'

-save_booster <- function (booster, model_name) {
-  booster_bin <- function (model_name) {
-    return (file.path(target_dir, paste('xgboost-', version, '.', model_name, '.bin', sep = '')))
+save_booster <- function(booster, model_name) {
+  booster_bin <- function(model_name) {
+    return(file.path(target_dir, paste('xgboost-', version, '.', model_name, '.bin', sep = '')))
   }
-  booster_json <- function (model_name) {
-    return (file.path(target_dir, paste('xgboost-', version, '.', model_name, '.json', sep = '')))
+  booster_json <- function(model_name) {
+    return(file.path(target_dir, paste('xgboost-', version, '.', model_name, '.json', sep = '')))
   }
-  booster_rds <- function (model_name) {
-    return (file.path(target_dir, paste('xgboost-', version, '.', model_name, '.rds', sep = '')))
+  booster_rds <- function(model_name) {
+    return(file.path(target_dir, paste('xgboost-', version, '.', model_name, '.rds', sep = '')))
   }
   xgb.save(booster, booster_bin(model_name))
   saveRDS(booster, booster_rds(model_name))
@@ -36,7 +36,7 @@ save_booster <- function (booster, model_name) {
   }
 }

-generate_regression_model <- function () {
+generate_regression_model <- function() {
   print('Regression')
   y <- rnorm(metadata$kRows)
@@ -47,7 +47,7 @@ generate_regression_model <- function () {
   save_booster(booster, 'reg')
 }

-generate_logistic_model <- function () {
+generate_logistic_model <- function() {
   print('Binary classification with logistic loss')
   y <- sample(0:1, size = metadata$kRows, replace = TRUE)
   stopifnot(max(y) == 1, min(y) == 0)
@@ -64,7 +64,7 @@ generate_logistic_model <- function () {
   }
 }

-generate_classification_model <- function () {
+generate_classification_model <- function() {
   print('Multi-class classification')
   y <- sample(0:(metadata$kClasses - 1), size = metadata$kRows, replace = TRUE)
   stopifnot(max(y) == metadata$kClasses - 1, min(y) == 0)
@@ -77,7 +77,7 @@ generate_classification_model <- function () {
   save_booster(booster, 'cls')
 }

-generate_ranking_model <- function () {
+generate_ranking_model <- function() {
   print('Learning to rank')
   y <- sample(0:4, size = metadata$kRows, replace = TRUE)
   stopifnot(max(y) == 4, min(y) == 0)
diff --git a/R-package/tests/helper_scripts/run-examples.R b/R-package/tests/helper_scripts/run-examples.R
new file mode 100644
index 000000000..08dd3d2a0
--- /dev/null
+++ b/R-package/tests/helper_scripts/run-examples.R
@@ -0,0 +1,25 @@
+## Helper script for running individual examples.
+library(pkgload)
+library(xgboost)
+
+files <- list.files("./man")
+
+
+run_example_timeit <- function(f) {
+  path <- paste("./man/", f, sep = "")
+  print(paste("Test", f))
+  flush.console()
+  t0 <- proc.time()
+  run_example(path)
+  t1 <- proc.time()
+  list(file = f, time = t1 - t0)
+}
+
+timings <- lapply(files, run_example_timeit)
+
+for (t in timings) {
+  ratio <- t$time[1] / t$time[3]
+  if (!is.na(ratio) && !is.infinite(ratio) && ratio >= 2.5) {
+    print(paste("Offending example:", t$file, ratio))
+  }
+}
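A usage sketch for the helper script above (invoking it from the R-package directory is an assumption; pkgload's run_example() drives the \examples{} section of a single Rd file):

    library(pkgload)
    ## Run the examples of one Rd file, e.g.:
    run_example("./man/xgb.train.Rd")
    ## Or sweep all of them, flagging examples whose user-CPU/elapsed ratio
    ## suggests more threads were used than intended:
    ##   Rscript tests/helper_scripts/run-examples.R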
diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R
index a21b03d77..b7e819738 100644
--- a/R-package/tests/testthat/test_basic.R
+++ b/R-package/tests/testthat/test_basic.R
@@ -1,23 +1,28 @@
 context("basic functions")

-data(agaricus.train, package = 'xgboost')
-data(agaricus.test, package = 'xgboost')
+data(agaricus.train, package = "xgboost")
+data(agaricus.test, package = "xgboost")
 train <- agaricus.train
 test <- agaricus.test
 set.seed(1994)

 # disable some tests for Win32
 windows_flag <- .Platform$OS.type == "windows" &&
-  .Machine$sizeof.pointer != 8
-solaris_flag <- (Sys.info()['sysname'] == "SunOS")
+  .Machine$sizeof.pointer != 8
+solaris_flag <- (Sys.info()["sysname"] == "SunOS")
+n_threads <- 1
+

 test_that("train and predict binary classification", {
   nrounds <- 2
   expect_output(
-    bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-                   eta = 1, nthread = 2, nrounds = nrounds, objective = "binary:logistic",
-                   eval_metric = "error")
-    , "train-error")
+    bst <- xgboost(
+      data = train$data, label = train$label, max_depth = 2,
+      eta = 1, nthread = n_threads, nrounds = nrounds,
+      objective = "binary:logistic", eval_metric = "error"
+    ),
+    "train-error"
+  )
   expect_equal(class(bst), "xgb.Booster")
   expect_equal(bst$niter, nrounds)
   expect_false(is.null(bst$evaluation_log))
@@ -46,26 +51,39 @@ test_that("parameter validation works", {
   d <- cbind(
     x1 = rnorm(10),
     x2 = rnorm(10),
-    x3 = rnorm(10))
+    x3 = rnorm(10)
+  )
   y <- d[, "x1"] + d[, "x2"]^2 +
     ifelse(d[, "x3"] > .5, d[, "x3"]^2, 2^d[, "x3"]) +
     rnorm(10)
-  dtrain <- xgb.DMatrix(data = d, info = list(label = y))
+  dtrain <- xgb.DMatrix(data = d, info = list(label = y), nthread = n_threads)

   correct <- function() {
-    params <- list(max_depth = 2, booster = "dart",
-                   rate_drop = 0.5, one_drop = TRUE,
-                   objective = "reg:squarederror")
+    params <- list(
+      max_depth = 2,
+      booster = "dart",
+      rate_drop = 0.5,
+      one_drop = TRUE,
+      nthread = n_threads,
+      objective = "reg:squarederror"
+    )
     xgb.train(params = params, data = dtrain, nrounds = nrounds)
   }
   expect_silent(correct())
   incorrect <- function() {
-    params <- list(max_depth = 2, booster = "dart",
-                   rate_drop = 0.5, one_drop = TRUE,
-                   objective = "reg:squarederror",
-                   foo = "bar", bar = "foo")
+    params <- list(
+      max_depth = 2,
+      booster = "dart",
+      rate_drop = 0.5,
+      one_drop = TRUE,
+      objective = "reg:squarederror",
+      nthread = n_threads,
+      foo = "bar",
+      bar = "foo"
+    )
     output <- capture.output(
-      xgb.train(params = params, data = dtrain, nrounds = nrounds))
+      xgb.train(params = params, data = dtrain, nrounds = nrounds)
+    )
     print(output)
   }
   expect_output(incorrect(), '\\\\"bar\\\\", \\\\"foo\\\\"')
@@ -79,7 +97,8 @@ test_that("dart prediction works", {
   d <- cbind(
     x1 = rnorm(100),
     x2 = rnorm(100),
-    x3 = rnorm(100))
+    x3 = rnorm(100)
+  )
   y <- d[, "x1"] + d[, "x2"]^2 +
     ifelse(d[, "x3"] > .5, d[, "x3"]^2, 2^d[, "x3"]) +
     rnorm(100)
@@ -93,7 +112,7 @@ test_that("dart prediction works", {
       rate_drop = 0.5,
       one_drop = TRUE,
       eta = 1,
-      nthread = 2,
+      nthread = n_threads,
       nrounds = nrounds,
       objective = "reg:squarederror"
     )
@@ -105,7 +124,7 @@ test_that("dart prediction works", {
   expect_false(all(matrix(pred_by_xgboost_0, byrow = TRUE) == matrix(pred_by_xgboost_2, byrow = TRUE)))

   set.seed(1994)
-  dtrain <- xgb.DMatrix(data = d, info = list(label = y))
+  dtrain <- xgb.DMatrix(data = d, info = list(label = y), nthread = n_threads)
   booster_by_train <- xgb.train(
     params = list(
       booster = "dart",
@@ -113,7 +132,7 @@ test_that("dart prediction works", {
       eta = 1,
       rate_drop = 0.5,
       one_drop = TRUE,
-      nthread = 1,
+      nthread = n_threads,
       objective = "reg:squarederror"
     ),
     data = dtrain,
@@ -132,10 +151,13 @@ test_that("train and predict softprob", {
   lb <- as.numeric(iris$Species) - 1
   set.seed(11)
   expect_output(
-    bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
-                   max_depth = 3, eta = 0.5, nthread = 2, nrounds = 5,
-                   objective = "multi:softprob", num_class = 3, eval_metric = "merror")
-    , "train-merror")
+    bst <- xgboost(
+      data = as.matrix(iris[, -5]), label = lb,
+      max_depth = 3, eta = 0.5, nthread = n_threads, nrounds = 5,
+      objective = "multi:softprob", num_class = 3, eval_metric = "merror"
+    ),
+    "train-merror"
+  )
   expect_false(is.null(bst$evaluation_log))
   expect_lt(bst$evaluation_log[, min(train_merror)], 0.025)
   expect_equal(bst$niter * 3, xgb.ntree(bst))
@@ -164,9 +186,10 @@ test_that("train and predict softprob", {
     x3 = rnorm(100)
   )
   y <- sample.int(10, 100, replace = TRUE) - 1
-  dtrain <- xgb.DMatrix(data = d, info = list(label = y))
+  dtrain <- xgb.DMatrix(data = d, info = list(label = y), nthread = n_threads)
   booster <- xgb.train(
-    params = list(tree_method = "hist"), data = dtrain, nrounds = 4, num_class = 10,
+    params = list(tree_method = "hist", nthread = n_threads),
+    data = dtrain, nrounds = 4, num_class = 10,
     objective = "multi:softprob"
   )
   predt <- predict(booster, as.matrix(d), reshape = TRUE, strict_shape = FALSE)
@@ -178,10 +201,13 @@ test_that("train and predict softmax", {
   lb <- as.numeric(iris$Species) - 1
   set.seed(11)
   expect_output(
-    bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
-                   max_depth = 3, eta = 0.5, nthread = 2, nrounds = 5,
-                   objective = "multi:softmax", num_class = 3, eval_metric = "merror")
-    , "train-merror")
+    bst <- xgboost(
+      data = as.matrix(iris[, -5]), label = lb,
+      max_depth = 3, eta = 0.5, nthread = n_threads, nrounds = 5,
+      objective = "multi:softmax", num_class = 3, eval_metric = "merror"
+    ),
+    "train-merror"
+  )
   expect_false(is.null(bst$evaluation_log))
   expect_lt(bst$evaluation_log[, min(train_merror)], 0.025)
   expect_equal(bst$niter * 3, xgb.ntree(bst))
@@ -196,16 +222,19 @@ test_that("train and predict RF", {
   set.seed(11)
   lb <- train$label
   # single iteration
-  bst <- xgboost(data = train$data, label = lb, max_depth = 5,
-                 nthread = 2, nrounds = 1, objective = "binary:logistic", eval_metric = "error",
-                 num_parallel_tree = 20, subsample = 0.6, colsample_bytree = 0.1)
+  bst <- xgboost(
+    data = train$data, label = lb, max_depth = 5,
+    nthread = n_threads,
+    nrounds = 1, objective = "binary:logistic", eval_metric = "error",
+    num_parallel_tree = 20, subsample = 0.6, colsample_bytree = 0.1
+  )
   expect_equal(bst$niter, 1)
   expect_equal(xgb.ntree(bst), 20)

   pred <- predict(bst, train$data)
   pred_err <- sum((pred > 0.5) != lb) / length(lb)
   expect_lt(abs(bst$evaluation_log[1, train_error] - pred_err), 10e-6)
-  #expect_lt(pred_err, 0.03)
+  # expect_lt(pred_err, 0.03)

   pred <- predict(bst, train$data, ntreelimit = 20)
   pred_err_20 <- sum((pred > 0.5) != lb) / length(lb)
@@ -219,11 +248,13 @@ test_that("train and predict RF with softprob", {
   lb <- as.numeric(iris$Species) - 1
   nrounds <- 15
   set.seed(11)
-  bst <- xgboost(data = as.matrix(iris[, -5]), label = lb,
-                 max_depth = 3, eta = 0.9, nthread = 2, nrounds = nrounds,
-                 objective = "multi:softprob", eval_metric = "merror",
-                 num_class = 3, verbose = 0,
-                 num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5)
+  bst <- xgboost(
+    data = as.matrix(iris[, -5]), label = lb,
+    max_depth = 3, eta = 0.9, nthread = n_threads, nrounds = nrounds,
+    objective = "multi:softprob", eval_metric = "merror",
+    num_class = 3, verbose = 0,
+    num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5
+  )
   expect_equal(bst$niter, 15)
   expect_equal(xgb.ntree(bst), 15 * 3 * 4)
   # predict for all iterations:
@@ -240,18 +271,24 @@ test_that("train and predict RF with softprob", {

 test_that("use of multiple eval metrics works", {
   expect_output(
-    bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-                   eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic",
-                   eval_metric = 'error', eval_metric = 'auc', eval_metric = "logloss")
-    , "train-error.*train-auc.*train-logloss")
+    bst <- xgboost(
+      data = train$data, label = train$label, max_depth = 2,
+      eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
+      eval_metric = "error", eval_metric = "auc", eval_metric = "logloss"
+    ),
+    "train-error.*train-auc.*train-logloss"
+  )
   expect_false(is.null(bst$evaluation_log))
   expect_equal(dim(bst$evaluation_log), c(2, 4))
   expect_equal(colnames(bst$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss"))

   expect_output(
-    bst2 <- xgboost(data = train$data, label = train$label, max_depth = 2,
-                    eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic",
-                    eval_metric = list("error", "auc", "logloss"))
-    , "train-error.*train-auc.*train-logloss")
+    bst2 <- xgboost(
+      data = train$data, label = train$label, max_depth = 2,
+      eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
+      eval_metric = list("error", "auc", "logloss")
+    ),
+    "train-error.*train-auc.*train-logloss"
+  )
   expect_false(is.null(bst2$evaluation_log))
   expect_equal(dim(bst2$evaluation_log), c(2, 4))
   expect_equal(colnames(bst2$evaluation_log), c("iter", "train_error", "train_auc", "train_logloss"))
@@ -259,9 +296,11 @@

 test_that("training continuation works", {
-  dtrain <- xgb.DMatrix(train$data, label = train$label)
+  dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = n_threads)
   watchlist <- list(train = dtrain)
-  param <- list(objective = "binary:logistic", max_depth = 2, eta = 1, nthread = 2)
+  param <- list(
+    objective = "binary:logistic", max_depth = 2, eta = 1, nthread = n_threads
+  )

   # for the reference, use 4 iterations at once:
   set.seed(11)
@@ -271,30 +310,33 @@ test_that("training continuation works", {
   bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0)
   # continue for two more:
   bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1)
-  if (!windows_flag && !solaris_flag)
+  if (!windows_flag && !solaris_flag) {
     expect_equal(bst$raw, bst2$raw)
+  }
   expect_false(is.null(bst2$evaluation_log))
   expect_equal(dim(bst2$evaluation_log), c(4, 2))
   expect_equal(bst2$evaluation_log, bst$evaluation_log)
   # test continuing from raw model data
   bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1$raw)
-  if (!windows_flag && !solaris_flag)
+  if (!windows_flag && !solaris_flag) {
     expect_equal(bst$raw, bst2$raw)
+  }
   expect_equal(dim(bst2$evaluation_log), c(2, 2))
   # test continuing from a model in file
   xgb.save(bst1, "xgboost.json")
   bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = "xgboost.json")
-  if (!windows_flag && !solaris_flag)
+  if (!windows_flag && !solaris_flag) {
     expect_equal(bst$raw, bst2$raw)
+  }
   expect_equal(dim(bst2$evaluation_log), c(2, 2))
   file.remove("xgboost.json")
 })

 test_that("model serialization works", {
   out_path <- "model_serialization"
-  dtrain <- xgb.DMatrix(train$data, label = train$label)
+  dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = n_threads)
   watchlist <- list(train = dtrain)
-  param <- list(objective = "binary:logistic")
+  param <- list(objective = "binary:logistic", nthread = n_threads)
   booster <- xgb.train(param, dtrain, nrounds = 4, watchlist)
   raw <- xgb.serialize(booster)
   saveRDS(raw, out_path)
@@ -309,11 +351,14 @@ test_that("model serialization works", {
 test_that("xgb.cv works", {
   set.seed(11)
   expect_output(
-    cv <- xgb.cv(data = train$data, label = train$label, max_depth = 2, nfold = 5,
-                 eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic",
-                 eval_metric = "error", verbose = TRUE)
-    , "train-error:")
-  expect_is(cv, 'xgb.cv.synchronous')
+    cv <- xgb.cv(
+      data = train$data, label = train$label, max_depth = 2, nfold = 5,
+      eta = 1., nthread = n_threads, nrounds = 2, objective = "binary:logistic",
+      eval_metric = "error", verbose = TRUE
+    ),
+    "train-error:"
+  )
+  expect_is(cv, "xgb.cv.synchronous")
   expect_false(is.null(cv$evaluation_log))
   expect_lt(cv$evaluation_log[, min(test_error_mean)], 0.03)
   expect_lt(cv$evaluation_log[, min(test_error_std)], 0.008)
@@ -326,15 +371,19 @@ test_that("xgb.cv works", {
 })

 test_that("xgb.cv works with stratified folds", {
-  dtrain <- xgb.DMatrix(train$data, label = train$label)
+  dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = n_threads)
   set.seed(314159)
-  cv <- xgb.cv(data = dtrain, max_depth = 2, nfold = 5,
-               eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic",
-               verbose = TRUE, stratified = FALSE)
+  cv <- xgb.cv(
+    data = dtrain, max_depth = 2, nfold = 5,
+    eta = 1., nthread = n_threads, nrounds = 2, objective = "binary:logistic",
+    verbose = TRUE, stratified = FALSE
+  )
   set.seed(314159)
-  cv2 <- xgb.cv(data = dtrain, max_depth = 2, nfold = 5,
-                eta = 1., nthread = 2, nrounds = 2, objective = "binary:logistic",
-                verbose = TRUE, stratified = TRUE)
+  cv2 <- xgb.cv(
+    data = dtrain, max_depth = 2, nfold = 5,
+    eta = 1., nthread = n_threads, nrounds = 2, objective = "binary:logistic",
+    verbose = TRUE, stratified = TRUE
+  )
   # Stratified folds should result in a different evaluation logs
   expect_true(all(cv$evaluation_log[, test_logloss_mean] != cv2$evaluation_log[, test_logloss_mean]))
 })
@@ -342,40 +391,57 @@ test_that("xgb.cv works with stratified folds", {

 test_that("train and predict with non-strict classes", {
   # standard dense matrix input
   train_dense <- as.matrix(train$data)
-  bst <- xgboost(data = train_dense, label = train$label, max_depth = 2,
-                 eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
+  bst <- xgboost(
+    data = train_dense, label = train$label, max_depth = 2,
+    eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
+    verbose = 0
+  )
   pr0 <- predict(bst, train_dense)

   # dense matrix-like input of non-matrix class
-  class(train_dense) <- 'shmatrix'
+  class(train_dense) <- "shmatrix"
   expect_true(is.matrix(train_dense))
   expect_error(
-    bst <- xgboost(data = train_dense, label = train$label, max_depth = 2,
-                   eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
-    , regexp = NA)
+    bst <- xgboost(
+      data = train_dense, label = train$label, max_depth = 2,
+      eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
+      verbose = 0
+    ),
+    regexp = NA
+  )
   expect_error(pr <- predict(bst, train_dense), regexp = NA)
   expect_equal(pr0, pr)

   # dense matrix-like input of non-matrix class with some inheritance
-  class(train_dense) <- c('pphmatrix', 'shmatrix')
+  class(train_dense) <- c("pphmatrix", "shmatrix")
   expect_true(is.matrix(train_dense))
   expect_error(
-    bst <- xgboost(data = train_dense, label = train$label, max_depth = 2,
-                   eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0)
-    , regexp = NA)
+    bst <- xgboost(
+      data = train_dense, label = train$label, max_depth = 2,
+      eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
+      verbose = 0
+    ),
+    regexp = NA
+  )
   expect_error(pr <- predict(bst, train_dense), regexp = NA)
   expect_equal(pr0, pr)

   # when someone inherits from xgb.Booster, it should still be possible to use it as xgb.Booster
-  class(bst) <- c('super.Booster', 'xgb.Booster')
+  class(bst) <- c("super.Booster", "xgb.Booster")
   expect_error(pr <- predict(bst, train_dense), regexp = NA)
   expect_equal(pr0, pr)
 })

 test_that("max_delta_step works", {
-  dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
+  dtrain <- xgb.DMatrix(
+    agaricus.train$data, label = agaricus.train$label, nthread = n_threads
+  )
   watchlist <- list(train = dtrain)
-  param <- list(objective = "binary:logistic", eval_metric = "logloss", max_depth = 2, nthread = 2, eta = 0.5)
+  param <- list(
+    objective = "binary:logistic", eval_metric = "logloss", max_depth = 2,
+    nthread = n_threads,
+    eta = 0.5
+  )
   nrounds <- 5
   # model with no restriction on max_delta_step
   bst1 <- xgb.train(param, dtrain, nrounds, watchlist, verbose = 1)
@@ -395,14 +461,16 @@ test_that("colsample_bytree works", {
   test_y <- as.numeric(rowSums(test_x) > 0)
   colnames(train_x) <- paste0("Feature_", sprintf("%03d", 1:100))
   colnames(test_x) <- paste0("Feature_", sprintf("%03d", 1:100))
-  dtrain <- xgb.DMatrix(train_x, label = train_y)
-  dtest <- xgb.DMatrix(test_x, label = test_y)
+  dtrain <- xgb.DMatrix(train_x, label = train_y, nthread = n_threads)
+  dtest <- xgb.DMatrix(test_x, label = test_y, nthread = n_threads)
   watchlist <- list(train = dtrain, eval = dtest)
   ## Use colsample_bytree = 0.01, so that roughly one out of 100 features is chosen for
   ## each tree
-  param <- list(max_depth = 2, eta = 0, nthread = 2,
-                colsample_bytree = 0.01, objective = "binary:logistic",
-                eval_metric = "auc")
+  param <- list(
+    max_depth = 2, eta = 0, nthread = n_threads,
+    colsample_bytree = 0.01, objective = "binary:logistic",
+    eval_metric = "auc"
+  )
   set.seed(2)
   bst <- xgb.train(param, dtrain, nrounds = 100, watchlist, verbose = 0)
   xgb.importance(model = bst)
@@ -412,9 +480,11 @@ test_that("colsample_bytree works", {
 })

 test_that("Configuration works", {
-  bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
-                 eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic",
-                 eval_metric = 'error', eval_metric = 'auc', eval_metric = "logloss")
+  bst <- xgboost(
+    data = train$data, label = train$label, max_depth = 2,
+    eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic",
+    eval_metric = "error", eval_metric = "auc", eval_metric = "logloss"
+  )
   config <- xgb.config(bst)
   xgb.config(bst) <- config
   reloaded_config <- xgb.config(bst)
@@ -451,22 +521,26 @@ test_that("strict_shape works", {
     y <- as.numeric(iris$Species) - 1
     X <- as.matrix(iris[, -5])

-    bst <- xgboost(data = X, label = y,
-                   max_depth = 2, nrounds = n_rounds,
-                   objective = "multi:softprob", num_class = 3, eval_metric = "merror")
+    bst <- xgboost(
+      data = X, label = y,
+      max_depth = 2, nrounds = n_rounds, nthread = n_threads,
+      objective = "multi:softprob", num_class = 3, eval_metric = "merror"
+    )

     test_strict_shape(bst, X, 3)
   }

   test_agaricus <- function() {
-    data(agaricus.train, package = 'xgboost')
+    data(agaricus.train, package = "xgboost")
     X <- agaricus.train$data
     y <- agaricus.train$label

-    bst <- xgboost(data = X, label = y, max_depth = 2,
-                   nrounds = n_rounds, objective = "binary:logistic",
-                   eval_metric = 'error', eval_metric = 'auc', eval_metric = "logloss")
+    bst <- xgboost(
+      data = X, label = y, max_depth = 2, nthread = n_threads,
+      nrounds = n_rounds, objective = "binary:logistic",
+      eval_metric = "error", eval_metric = "auc", eval_metric = "logloss"
+    )

     test_strict_shape(bst, X, 1)
   }
@@ -481,8 +555,10 @@ test_that("'predict' accepts CSR data", {
   x_csc <- as(X[1L, , drop = FALSE], "CsparseMatrix")
   x_csr <- as(x_csc, "RsparseMatrix")
   x_spv <- as(x_csc, "sparseVector")
-  bst <- xgboost(data = X, label = y, objective = "binary:logistic",
-                 nrounds = 5L, verbose = FALSE)
+  bst <- xgboost(
+    data = X, label = y, objective = "binary:logistic",
+    nrounds = 5L, verbose = FALSE, nthread = n_threads,
+  )
   p_csc <- predict(bst, x_csc)
   p_csr <- predict(bst, x_csr)
   p_spv <- predict(bst, x_spv)
diff --git a/R-package/tests/testthat/test_callbacks.R b/R-package/tests/testthat/test_callbacks.R
index e6fe14c6b..b5d3c5310 100644
--- a/R-package/tests/testthat/test_callbacks.R
+++ b/R-package/tests/testthat/test_callbacks.R
@@ -6,6 +6,8 @@ data(agaricus.test, package = 'xgboost')
 train <- agaricus.train
 test <- agaricus.test

+n_threads <- 2
+
 # add some label noise for early stopping tests
 add.noise <- function(label, frac) {
   inoise <- sample(length(label), length(label) * frac)
@@ -15,15 +17,15 @@ add.noise <- function(label, frac) {
 set.seed(11)
 ltrain <- add.noise(train$label, 0.2)
 ltest <- add.noise(test$label, 0.2)
-dtrain <- xgb.DMatrix(train$data, label = ltrain)
-dtest <- xgb.DMatrix(test$data, label = ltest)
+dtrain <- xgb.DMatrix(train$data, label = ltrain, nthread = n_threads)
+dtest <- xgb.DMatrix(test$data, label = ltest, nthread = n_threads)
 watchlist <- list(train = dtrain, test = dtest)

 err <- function(label, pr) sum((pr > 0.5) != label) / length(label)

 param <- list(objective = "binary:logistic", eval_metric = "error",
-              max_depth = 2, nthread = 2)
+              max_depth = 2, nthread = n_threads)

 test_that("cb.print.evaluation works as expected", {
@@ -103,7 +105,7 @@ test_that("cb.evaluation.log works as expected", {

 param <- list(objective = "binary:logistic", eval_metric = "error",
-              max_depth = 4, nthread = 2)
+              max_depth = 4, nthread = n_threads)

 test_that("can store evaluation_log without printing", {
   expect_silent(
@@ -179,8 +181,10 @@ test_that("cb.save.model works as expected", {
   expect_true(file.exists('xgboost_01.json'))
   expect_true(file.exists('xgboost_02.json'))
   b1 <- xgb.load('xgboost_01.json')
+  xgb.parameters(b1) <- list(nthread = 2)
   expect_equal(xgb.ntree(b1), 1)
   b2 <- xgb.load('xgboost_02.json')
+  xgb.parameters(b2) <- list(nthread = 2)
   expect_equal(xgb.ntree(b2), 2)

   xgb.config(b2) <- xgb.config(bst)
@@ -267,7 +271,8 @@ test_that("early stopping works with titanic", {
     objective = "binary:logistic",
     eval_metric = "auc",
     nrounds = 100,
-    early_stopping_rounds = 3
+    early_stopping_rounds = 3,
+    nthread = n_threads
   )

   expect_true(TRUE)  # should not crash
@@ -308,7 +313,7 @@ test_that("prediction in xgb.cv works", {

 test_that("prediction in xgb.cv works for gblinear too", {
   set.seed(11)
-  p <- list(booster = 'gblinear', objective = "reg:logistic", nthread = 2)
+  p <- list(booster = 'gblinear', objective = "reg:logistic", nthread = n_threads)
   cv <- xgb.cv(p, dtrain, nfold = 5, eta = 0.5, nrounds = 2, prediction = TRUE, verbose = 0)
   expect_false(is.null(cv$evaluation_log))
   expect_false(is.null(cv$pred))
@@ -341,7 +346,7 @@ test_that("prediction in xgb.cv for softprob works", {
   set.seed(11)
   expect_warning(
     cv <- xgb.cv(data = as.matrix(iris[, -5]), label = lb, nfold = 4,
-                 eta = 0.5, nrounds = 5, max_depth = 3, nthread = 2,
+                 eta = 0.5, nrounds = 5, max_depth = 3, nthread = n_threads,
                  subsample = 0.8, gamma = 2, verbose = 0,
                  prediction = TRUE, objective = "multi:softprob", num_class = 3)
     , NA)
diff --git a/R-package/tests/testthat/test_custom_objective.R b/R-package/tests/testthat/test_custom_objective.R
index 42f43cede..ff8eb1d6d 100644
--- a/R-package/tests/testthat/test_custom_objective.R
+++ b/R-package/tests/testthat/test_custom_objective.R
@@ -2,10 +2,16 @@ context('Test models with custom objective')

 set.seed(1994)

+n_threads <- 2
+
 data(agaricus.train, package = 'xgboost')
 data(agaricus.test, package = 'xgboost')
-dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
-dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+dtrain <- xgb.DMatrix(
+  agaricus.train$data, label = agaricus.train$label, nthread = n_threads
+)
+dtest <- xgb.DMatrix(
+  agaricus.test$data, label = agaricus.test$label, nthread = n_threads
+)
 watchlist <- list(eval = dtest, train = dtrain)

 logregobj <- function(preds, dtrain) {
@@ -22,7 +28,7 @@ evalerror <- function(preds, dtrain) {
   return(list(metric = "error", value = err))
 }

-param <- list(max_depth = 2, eta = 1, nthread = 2,
+param <- list(max_depth = 2, eta = 1, nthread = n_threads,
               objective = logregobj, eval_metric = evalerror)
 num_round <- 2
@@ -67,7 +73,7 @@ test_that("custom objective using DMatrix attr works", {
 test_that("custom objective with multi-class shape", {
   data <- as.matrix(iris[, -5])
   label <- as.numeric(iris$Species) - 1
-  dtrain <- xgb.DMatrix(data = data, label = label)
+  dtrain <- xgb.DMatrix(data = data, label = label, nthread = n_threads)
   n_classes <- 3

   fake_softprob <- function(preds, dtrain) {
diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R
index 8d74a0357..461b7d158 100644
--- a/R-package/tests/testthat/test_dmatrix.R
+++ b/R-package/tests/testthat/test_dmatrix.R
@@ -5,19 +5,21 @@ data(agaricus.test, package = "xgboost")
 test_data <- agaricus.test$data[1:100, ]
 test_label <- agaricus.test$label[1:100]

+n_threads <- 2
+
 test_that("xgb.DMatrix: basic construction", {
   # from sparse matrix
-  dtest1 <- xgb.DMatrix(test_data, label = test_label)
+  dtest1 <- xgb.DMatrix(test_data, label = test_label, nthread = n_threads)

   # from dense matrix
-  dtest2 <- xgb.DMatrix(as.matrix(test_data), label = test_label)
+  dtest2 <- xgb.DMatrix(as.matrix(test_data), label = test_label, nthread = n_threads)
   expect_equal(getinfo(dtest1, "label"), getinfo(dtest2, "label"))
   expect_equal(dim(dtest1), dim(dtest2))

   # from dense integer matrix
   int_data <- as.matrix(test_data)
   storage.mode(int_data) <- "integer"
-  dtest3 <- xgb.DMatrix(int_data, label = test_label)
+  dtest3 <- xgb.DMatrix(int_data, label = test_label, nthread = n_threads)
   expect_equal(dim(dtest1), dim(dtest3))

   n_samples <- 100
@@ -29,15 +31,15 @@ test_that("xgb.DMatrix: basic construction", {
   X <- matrix(X, nrow = n_samples)
   y <- rbinom(n = n_samples, size = 1, prob = 1 / 2)

-  fd <- xgb.DMatrix(X, label = y, missing = 1)
+  fd <- xgb.DMatrix(X, label = y, missing = 1, nthread = n_threads)

   dgc <- as(X, "dgCMatrix")
-  fdgc <- xgb.DMatrix(dgc, label = y, missing = 1.0)
+  fdgc <- xgb.DMatrix(dgc, label = y, missing = 1.0, nthread = n_threads)

   dgr <- as(X, "dgRMatrix")
-  fdgr <- xgb.DMatrix(dgr, label = y, missing = 1)
+  fdgr <- xgb.DMatrix(dgr, label = y, missing = 1, nthread = n_threads)

-  params <- list(tree_method = "hist")
+  params <- list(tree_method = "hist", nthread = n_threads)
   bst_fd <- xgb.train(
     params, nrounds = 8, fd, watchlist = list(train = fd)
   )
@@ -64,12 +66,12 @@ test_that("xgb.DMatrix: NA", {
   )
   x[1, "x1"] <- NA

-  m <- xgb.DMatrix(x)
+  m <- xgb.DMatrix(x, nthread = n_threads)
   xgb.DMatrix.save(m, "int.dmatrix")

   x <- matrix(as.numeric(x), nrow = n_samples, ncol = 2)
   colnames(x) <- c("x1", "x2")
-  m <- xgb.DMatrix(x)
+  m <- xgb.DMatrix(x, nthread = n_threads)
   xgb.DMatrix.save(m, "float.dmatrix")

@@ -94,7 +96,7 @@ test_that("xgb.DMatrix: NA", {

 test_that("xgb.DMatrix: saving, loading", {
   # save to a local file
-  dtest1 <- xgb.DMatrix(test_data, label = test_label)
+  dtest1 <- xgb.DMatrix(test_data, label = test_label, nthread = n_threads)
   tmp_file <- tempfile('xgb.DMatrix_')
   on.exit(unlink(tmp_file))
   expect_true(xgb.DMatrix.save(dtest1, tmp_file))
@@ -109,13 +111,17 @@ test_that("xgb.DMatrix: saving, loading", {
   tmp_file <- tempfile(fileext = ".libsvm")
   writeLines(tmp, tmp_file)
   expect_true(file.exists(tmp_file))
-  dtest4 <- xgb.DMatrix(paste(tmp_file, "?format=libsvm", sep = ""), silent = TRUE)
+  dtest4 <- xgb.DMatrix(
+    paste(tmp_file, "?format=libsvm", sep = ""), silent = TRUE, nthread = n_threads
+  )
   expect_equal(dim(dtest4), c(3, 4))
   expect_equal(getinfo(dtest4, 'label'), c(0, 1, 0))

   # check that feature info is saved
   data(agaricus.train, package = 'xgboost')
-  dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label)
+  dtrain <- xgb.DMatrix(
+    data = agaricus.train$data, label = agaricus.train$label, nthread = n_threads
+  )
   cnames <- colnames(dtrain)
   expect_equal(length(cnames), 126)
   tmp_file <- tempfile('xgb.DMatrix_')
@@ -129,7 +135,7 @@ test_that("xgb.DMatrix: saving, loading", {
 })

 test_that("xgb.DMatrix: getinfo & setinfo", {
-  dtest <- xgb.DMatrix(test_data)
+  dtest <- xgb.DMatrix(test_data, nthread = n_threads)
   expect_true(setinfo(dtest, 'label', test_label))
   labels <- getinfo(dtest, 'label')
   expect_equal(test_label, getinfo(dtest, 'label'))
@@ -156,7 +162,7 @@ test_that("xgb.DMatrix: getinfo & setinfo", {
 })

 test_that("xgb.DMatrix: slice, dim", {
-  dtest <- xgb.DMatrix(test_data, label = test_label)
+  dtest <- xgb.DMatrix(test_data, label = test_label, nthread = n_threads)
   expect_equal(dim(dtest), dim(test_data))
   dsub1 <- slice(dtest, 1:42)
   expect_equal(nrow(dsub1), 42)
@@ -171,16 +177,20 @@ test_that("xgb.DMatrix: slice, trailing empty rows", {
   data(agaricus.train, package = 'xgboost')
   train_data <- agaricus.train$data
   train_label <- agaricus.train$label
-  dtrain <- xgb.DMatrix(data = train_data, label = train_label)
+  dtrain <- xgb.DMatrix(
+    data = train_data, label = train_label, nthread = n_threads
+  )
   slice(dtrain, 6513L)
   train_data[6513, ] <- 0
-  dtrain <- xgb.DMatrix(data = train_data, label = train_label)
+  dtrain <- xgb.DMatrix(
+    data = train_data, label = train_label, nthread = n_threads
+  )
   slice(dtrain, 6513L)
   expect_equal(nrow(dtrain), 6513)
 })

 test_that("xgb.DMatrix: colnames", {
-  dtest <- xgb.DMatrix(test_data, label = test_label)
+  dtest <- xgb.DMatrix(test_data, label = test_label, nthread = n_threads)
   expect_equal(colnames(dtest), colnames(test_data))
   expect_error(colnames(dtest) <- 'asdf')
   new_names <- make.names(seq_len(ncol(test_data)))
@@ -196,7 +206,7 @@ test_that("xgb.DMatrix: nrow is correct for a very sparse matrix", {
   x <- Matrix::rsparsematrix(nr, 100, density = 0.0005)
   # we want it very sparse, so that last rows are empty
   expect_lt(max(x@i), nr)
-  dtest <- xgb.DMatrix(x)
+  dtest <- xgb.DMatrix(x, nthread = n_threads)
   expect_equal(dim(dtest), dim(x))
 })

@@ -205,8 +215,8 @@ test_that("xgb.DMatrix: print", {

   # core DMatrix with just data and labels
   dtrain <- xgb.DMatrix(
-    data = agaricus.train$data
-    , label = agaricus.train$label
+    data = agaricus.train$data, label = agaricus.train$label,
+    nthread = n_threads
   )
   txt <- capture.output({
     print(dtrain)
@@ -222,10 +232,11 @@ test_that("xgb.DMatrix: print", {

   # DMatrix with weights and base_margin
   dtrain <- xgb.DMatrix(
-    data = agaricus.train$data
-    , label = agaricus.train$label
-    , weight = seq_along(agaricus.train$label)
-    , base_margin = agaricus.train$label
+    data = agaricus.train$data,
+    label = agaricus.train$label,
+    weight = seq_along(agaricus.train$label),
+    base_margin = agaricus.train$label,
+    nthread = n_threads
   )
   txt <- capture.output({
     print(dtrain)
@@ -234,7 +245,8 @@ test_that("xgb.DMatrix: print", {

   # DMatrix with just features
   dtrain <- xgb.DMatrix(
-    data = agaricus.train$data
+    data = agaricus.train$data,
+    nthread = n_threads
   )
   txt <- capture.output({
     print(dtrain)
@@ -245,7 +257,8 @@ test_that("xgb.DMatrix: print", {
   data_no_colnames <- agaricus.train$data
   colnames(data_no_colnames) <- NULL
   dtrain <- xgb.DMatrix(
-    data = data_no_colnames
+    data = data_no_colnames,
+    nthread = n_threads
   )
   txt <- capture.output({
     print(dtrain)
diff --git a/R-package/tests/testthat/test_feature_weights.R b/R-package/tests/testthat/test_feature_weights.R
index bb3802979..4ed78c9b6 100644
--- a/R-package/tests/testthat/test_feature_weights.R
+++ b/R-package/tests/testthat/test_feature_weights.R
@@ -1,5 +1,7 @@
 context("feature weights")

+n_threads <- 2
+
 test_that("training with feature weights works", {
   nrows <- 1000
   ncols <- 9
@@ -10,8 +12,12 @@ test_that("training with feature weights works", {

   test <- function(tm) {
     names <- paste0("f", 1:ncols)
-    xy <- xgb.DMatrix(data = x, label = y, feature_weights = weights)
-    params <- list(colsample_bynode = 0.4, tree_method = tm, nthread = 1)
+    xy <- xgb.DMatrix(
+      data = x, label = y, feature_weights = weights, nthread = n_threads
+    )
+    params <- list(
+      colsample_bynode = 0.4, tree_method = tm, nthread = n_threads
+    )
     model <- xgb.train(params = params, data = xy, nrounds = 32)
     importance <- xgb.importance(model = model, feature_names = names)
     expect_equal(dim(importance), c(ncols, 4))
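The feature-weights test above can be condensed into a standalone sketch (the data shapes and thread count are illustrative; with colsample_bynode < 1, columns are sampled at each split in proportion to feature_weights):

    library(xgboost)
    x <- matrix(rnorm(1000 * 9), nrow = 1000)
    y <- rnorm(1000)
    fw <- seq_len(9)  # column 9 is nine times as likely to be sampled as column 1
    dm <- xgb.DMatrix(data = x, label = y, feature_weights = fw, nthread = 2)
    bst <- xgb.train(
      params = list(colsample_bynode = 0.4, tree_method = "hist", nthread = 2),
      data = dm, nrounds = 32
    )
    xgb.importance(model = bst)  # importance skews toward the heavier columns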
diff --git a/R-package/tests/testthat/test_glm.R b/R-package/tests/testthat/test_glm.R
index 2d050945a..9e0a3551f 100644
--- a/R-package/tests/testthat/test_glm.R
+++ b/R-package/tests/testthat/test_glm.R
@@ -1,13 +1,19 @@
 context('Test generalized linear models')

+n_threads <- 2
+
 test_that("gblinear works", {
   data(agaricus.train, package = 'xgboost')
   data(agaricus.test, package = 'xgboost')
-  dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
-  dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+  dtrain <- xgb.DMatrix(
+    agaricus.train$data, label = agaricus.train$label, nthread = n_threads
+  )
+  dtest <- xgb.DMatrix(
+    agaricus.test$data, label = agaricus.test$label, nthread = n_threads
+  )

   param <- list(objective = "binary:logistic", eval_metric = "error", booster = "gblinear",
-                nthread = 2, eta = 0.8, alpha = 0.0001, lambda = 0.0001)
+                nthread = n_threads, eta = 0.8, alpha = 0.0001, lambda = 0.0001)
   watchlist <- list(eval = dtest, train = dtrain)

   n <- 5  # iterations
@@ -48,12 +54,16 @@ test_that("gblinear works", {
 test_that("gblinear early stopping works", {
   data(agaricus.train, package = 'xgboost')
   data(agaricus.test, package = 'xgboost')
-  dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
-  dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+  dtrain <- xgb.DMatrix(
+    agaricus.train$data, label = agaricus.train$label, nthread = n_threads
+  )
+  dtest <- xgb.DMatrix(
+    agaricus.test$data, label = agaricus.test$label, nthread = n_threads
+  )

   param <- list(
     objective = "binary:logistic", eval_metric = "error", booster = "gblinear",
-    nthread = 2, eta = 0.8, alpha = 0.0001, lambda = 0.0001,
+    nthread = n_threads, eta = 0.8, alpha = 0.0001, lambda = 0.0001,
     updater = "coord_descent"
   )
diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R
index f00ac0881..04e034ce1 100644
--- a/R-package/tests/testthat/test_helpers.R
+++ b/R-package/tests/testthat/test_helpers.R
@@ -171,6 +171,7 @@ test_that("SHAPs sum to predictions, with or without DART", {
   fit <- xgboost(
     params = c(
       list(
+        nthread = 2,
         booster = booster,
         objective = "reg:squarederror",
         eval_metric = "rmse"),
@@ -257,7 +258,7 @@ test_that("xgb.Booster serializing as R object works", {
   .skip_if_vcd_not_available()
   saveRDS(bst.Tree, 'xgb.model.rds')
   bst <- readRDS('xgb.model.rds')
-  dtrain <- xgb.DMatrix(sparse_matrix, label = label)
+  dtrain <- xgb.DMatrix(sparse_matrix, label = label, nthread = 2)
   expect_equal(predict(bst.Tree, dtrain), predict(bst, dtrain), tolerance = float_tolerance)
   expect_equal(xgb.dump(bst.Tree), xgb.dump(bst))
   xgb.save(bst, 'xgb.model')
@@ -363,7 +364,8 @@ test_that("xgb.importance works with and without feature names", {
     data = as.matrix(data.frame(x = c(0, 1))),
     label = c(1, 2),
     nrounds = 1,
-    base_score = 0.5
+    base_score = 0.5,
+    nthread = 2
   )
   df <- xgb.model.dt.tree(model = m)
   expect_equal(df$Feature, "Leaf")
diff --git a/R-package/tests/testthat/test_interaction_constraints.R b/R-package/tests/testthat/test_interaction_constraints.R
index 680709638..1ac804501 100644
--- a/R-package/tests/testthat/test_interaction_constraints.R
+++ b/R-package/tests/testthat/test_interaction_constraints.R
@@ -2,6 +2,8 @@ require(xgboost)

 context("interaction constraints")

+n_threads <- 2
+
 set.seed(1024)
 x1 <- rnorm(1000, 1)
 x2 <- rnorm(1000, 1)
@@ -45,11 +47,18 @@ test_that("interaction constraints scientific representation", {
   d <- matrix(rexp(rows, rate = .1), nrow = rows, ncol = cols)
   y <- rnorm(rows)

-  dtrain <- xgb.DMatrix(data = d, info = list(label = y))
+  dtrain <- xgb.DMatrix(data = d, info = list(label = y), nthread = n_threads)
   inc <- list(c(seq.int(from = 0, to = cols, by = 1)))

-  with_inc <- xgb.train(data = dtrain, tree_method = 'hist',
-                        interaction_constraints = inc, nrounds = 10)
-  without_inc <- xgb.train(data = dtrain, tree_method = 'hist', nrounds = 10)
+  with_inc <- xgb.train(
+    data = dtrain,
+    tree_method = 'hist',
+    interaction_constraints = inc,
+    nrounds = 10,
+    nthread = n_threads
+  )
+  without_inc <- xgb.train(
+    data = dtrain, tree_method = 'hist', nrounds = 10, nthread = n_threads
+  )
   expect_equal(xgb.save.raw(with_inc), xgb.save.raw(without_inc))
 })
diff --git a/R-package/tests/testthat/test_interactions.R b/R-package/tests/testthat/test_interactions.R
index a658fc81f..398531e0e 100644
--- a/R-package/tests/testthat/test_interactions.R
+++ b/R-package/tests/testthat/test_interactions.R
@@ -1,6 +1,7 @@
 context('Test prediction of feature interactions')

 set.seed(123)
+n_threads <- 2

 test_that("predict feature interactions works", {
   # simulate some binary data and a linear outcome with an interaction term
@@ -19,8 +20,10 @@ test_that("predict feature interactions works", {

   y <- f_int(X)

-  dm <- xgb.DMatrix(X, label = y)
-  param <- list(eta = 0.1, max_depth = 4, base_score = mean(y), lambda = 0, nthread = 2)
+  dm <- xgb.DMatrix(X, label = y, nthread = n_threads)
+  param <- list(
+    eta = 0.1, max_depth = 4, base_score = mean(y), lambda = 0, nthread = n_threads
+  )
   b <- xgb.train(param, dm, 100)

   pred <- predict(b, dm, outputmargin = TRUE)
@@ -99,11 +102,13 @@ test_that("SHAP contribution values are not NAN", {
     verbose = 0,
     params = list(
       objective = "reg:squarederror",
-      eval_metric = "rmse"),
+      eval_metric = "rmse",
+      nthread = n_threads
+    ),
     data = as.matrix(subset(d, fold == 2)[, ivs]),
     label = subset(d, fold == 2)$y,
-    nthread = 1,
-    nrounds = 3)
+    nrounds = 3
+  )

   shaps <- as.data.frame(predict(fit,
     newdata = as.matrix(subset(d, fold == 1)[, ivs]),
@@ -116,8 +121,12 @@ test_that("SHAP contribution values are not NAN", {

 test_that("multiclass feature interactions work", {
-  dm <- xgb.DMatrix(as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1)
-  param <- list(eta = 0.1, max_depth = 4, objective = 'multi:softprob', num_class = 3)
+  dm <- xgb.DMatrix(
+    as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1, nthread = n_threads
+  )
+  param <- list(
+    eta = 0.1, max_depth = 4, objective = 'multi:softprob', num_class = 3, nthread = n_threads
+  )
   b <- xgb.train(param, dm, 40)
   pred <- t(
     array(
@@ -166,6 +175,7 @@ test_that("SHAP single sample works", {
     max_depth = 2,
     nrounds = 4,
     objective = "binary:logistic",
+    nthread = n_threads
   )

   predt <- predict(
diff --git a/R-package/tests/testthat/test_io.R b/R-package/tests/testthat/test_io.R
index c2cb1a1a8..8cf5a9ae9 100644
--- a/R-package/tests/testthat/test_io.R
+++ b/R-package/tests/testthat/test_io.R
@@ -9,7 +9,8 @@ test_that("load/save raw works", {
   nrounds <- 8
   booster <- xgboost(
     data = train$data, label = train$label,
-    nrounds = nrounds, objective = "binary:logistic"
+    nrounds = nrounds, objective = "binary:logistic",
+    nthread = 2
   )

   json_bytes <- xgb.save.raw(booster, raw_format = "json")
diff --git a/R-package/tests/testthat/test_model_compatibility.R b/R-package/tests/testthat/test_model_compatibility.R
index 2f4992c06..ce1725dc9 100644
--- a/R-package/tests/testthat/test_model_compatibility.R
+++ b/R-package/tests/testthat/test_model_compatibility.R
@@ -9,20 +9,20 @@ metadata <- list(
   kClasses = 3
 )

-run_model_param_check <- function (config) {
+run_model_param_check <- function(config) {
   testthat::expect_equal(config$learner$learner_model_param$num_feature, '4')
   testthat::expect_equal(config$learner$learner_train_param$booster, 'gbtree')
 }

-get_num_tree <- function (booster) {
+get_num_tree <- function(booster) {
   dump <- xgb.dump(booster)
   m <- regexec('booster\\[[0-9]+\\]', dump, perl = TRUE)
   m <- regmatches(dump, m)
   num_tree <- Reduce('+', lapply(m, length))
-  return (num_tree)
+  return(num_tree)
 }

-run_booster_check <- function (booster, name) {
+run_booster_check <- function(booster, name) {
   # If given a handle, we need to call xgb.Booster.complete() prior to using xgb.config().
   if (inherits(booster, "xgb.Booster") && xgboost:::is.null.handle(booster$handle)) {
     booster <- xgb.Booster.complete(booster)
@@ -66,9 +66,9 @@ test_that("Models from previous versions of XGBoost can be loaded", {
   unzip(zipfile, exdir = extract_dir, overwrite = TRUE)
   model_dir <- file.path(extract_dir, 'models')

-  pred_data <- xgb.DMatrix(matrix(c(0, 0, 0, 0), nrow = 1, ncol = 4))
+  pred_data <- xgb.DMatrix(matrix(c(0, 0, 0, 0), nrow = 1, ncol = 4), nthread = 2)

-  lapply(list.files(model_dir), function (x) {
+  lapply(list.files(model_dir), function(x) {
     model_file <- file.path(model_dir, x)
     m <- regexec("xgboost-([0-9\\.]+)\\.([a-z]+)\\.[a-z]+", model_file, perl = TRUE)
     m <- regmatches(model_file, m)[[1]]
@@ -87,6 +87,7 @@ test_that("Models from previous versions of XGBoost can be loaded", {
       booster <- readRDS(model_file)
     } else {
       booster <- xgb.load(model_file)
+      xgb.parameters(booster) <- list(nthread = 2)
     }
     predict(booster, newdata = pred_data)
     run_booster_check(booster, name)
diff --git a/R-package/tests/testthat/test_parameter_exposure.R b/R-package/tests/testthat/test_parameter_exposure.R
index 47524fbfc..ea71ca7b7 100644
--- a/R-package/tests/testthat/test_parameter_exposure.R
+++ b/R-package/tests/testthat/test_parameter_exposure.R
@@ -3,8 +3,12 @@ context('Test model params and call are exposed to R')
 data(agaricus.train, package = 'xgboost')
 data(agaricus.test, package = 'xgboost')

-dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
-dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+dtrain <- xgb.DMatrix(
+  agaricus.train$data, label = agaricus.train$label, nthread = 2
+)
+dtest <- xgb.DMatrix(
+  agaricus.test$data, label = agaricus.test$label, nthread = 2
+)

 bst <- xgboost(data = dtrain,
                max_depth = 2,
diff --git a/R-package/tests/testthat/test_poisson_regression.R b/R-package/tests/testthat/test_poisson_regression.R
index b17c6c072..55918b57a 100644
--- a/R-package/tests/testthat/test_poisson_regression.R
+++ b/R-package/tests/testthat/test_poisson_regression.R
@@ -4,8 +4,10 @@ set.seed(1994)

 test_that("Poisson regression works", {
   data(mtcars)
-  bst <- xgboost(data = as.matrix(mtcars[, -11]), label = mtcars[, 11],
-                 objective = 'count:poisson', nrounds = 10, verbose = 0)
+  bst <- xgboost(
+    data = as.matrix(mtcars[, -11]), label = mtcars[, 11],
+    objective = 'count:poisson', nrounds = 10, verbose = 0, nthread = 2
+  )
   expect_equal(class(bst), "xgb.Booster")
   pred <- predict(bst, as.matrix(mtcars[, -11]))
   expect_equal(length(pred), 32)
diff --git a/R-package/tests/testthat/test_ranking.R b/R-package/tests/testthat/test_ranking.R
index 9e8d0156e..d4102dfce 100644
--- a/R-package/tests/testthat/test_ranking.R
+++ b/R-package/tests/testthat/test_ranking.R
@@ -1,5 +1,7 @@
 context('Learning to rank')

+n_threads <- 2
+
 test_that('Test ranking with unweighted data', {
   X <- Matrix::sparseMatrix(
     i = c(2, 3, 7, 9, 12, 15, 17, 18)
     ,
   )
   y <- c(0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0)
   group <- c(5, 5, 5, 5)
-  dtrain <- xgb.DMatrix(X, label = y, group = group)
+  dtrain <- xgb.DMatrix(X, label = y, group = group, nthread = n_threads)

   params <- list(eta = 1, tree_method = 'exact', objective = 'rank:pairwise', max_depth = 1,
-                 eval_metric = 'auc', eval_metric = 'aucpr')
+                 eval_metric = 'auc', eval_metric = 'aucpr', nthread = n_threads)
   bst <- xgb.train(params, dtrain, nrounds = 10, watchlist = list(train = dtrain))
   # Check if the metric is monotone increasing
   expect_true(all(diff(bst$evaluation_log$train_auc) >= 0))
@@ -29,10 +31,14 @@ test_that('Test ranking with weighted data', {
   y <- c(0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0)
   group <- c(5, 5, 5, 5)
   weight <- c(1.0, 2.0, 3.0, 4.0)
-  dtrain <- xgb.DMatrix(X, label = y, group = group, weight = weight)
+  dtrain <- xgb.DMatrix(
+    X, label = y, group = group, weight = weight, nthread = n_threads
+  )

-  params <- list(eta = 1, tree_method = 'exact', objective = 'rank:pairwise', max_depth = 1,
-                 eval_metric = 'auc', eval_metric = 'aucpr')
+  params <- list(
+    eta = 1, tree_method = "exact", objective = "rank:pairwise", max_depth = 1,
+    eval_metric = "auc", eval_metric = "aucpr", nthread = n_threads
+  )
   bst <- xgb.train(params, dtrain, nrounds = 10, watchlist = list(train = dtrain))
   # Check if the metric is monotone increasing
   expect_true(all(diff(bst$evaluation_log$train_auc) >= 0))
@@ -41,7 +47,7 @@ test_that('Test ranking with weighted data', {
     pred <- predict(bst, newdata = dtrain, ntreelimit = i)
     # is_sorted[i]: is i-th group correctly sorted by the ranking predictor?
     is_sorted <- lapply(seq(1, 20, by = 5),
-      function (k) {
+      function(k) {
         ind <- order(-pred[k:(k + 4)])
         z <- y[ind + (k - 1)]
         all(diff(z) <= 0)  # Check if z is monotone decreasing
diff --git a/R-package/tests/testthat/test_unicode.R b/R-package/tests/testthat/test_unicode.R
index cac544ef9..c8a225716 100644
--- a/R-package/tests/testthat/test_unicode.R
+++ b/R-package/tests/testthat/test_unicode.R
@@ -16,6 +16,7 @@ test_that("Can save and load models with Unicode paths", {
     path <- file.path(tmpdir, x)
     xgb.save(bst, path)
     bst2 <- xgb.load(path)
+    xgb.parameters(bst2) <- list(nthread = 2)
     expect_equal(predict(bst, test$data), predict(bst2, test$data))
   })
 })
diff --git a/R-package/tests/testthat/test_update.R b/R-package/tests/testthat/test_update.R
index c961bab1a..cf8b6f007 100644
--- a/R-package/tests/testthat/test_update.R
+++ b/R-package/tests/testthat/test_update.R
@@ -2,8 +2,15 @@ context("update trees in an existing model")

 data(agaricus.train, package = 'xgboost')
 data(agaricus.test, package = 'xgboost')
-dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
-dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
+
+n_threads <- 1
+
+dtrain <- xgb.DMatrix(
+  agaricus.train$data, label = agaricus.train$label, nthread = n_threads
+)
+dtest <- xgb.DMatrix(
+  agaricus.test$data, label = agaricus.test$label, nthread = n_threads
+)

 # Disable flaky tests for 32-bit Windows.
 # See https://github.com/dmlc/xgboost/issues/3720
@@ -14,7 +21,7 @@ test_that("updating the model works", {

   # no-subsampling
   p1 <- list(
-    objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = 2,
+    objective = "binary:logistic", max_depth = 2, eta = 0.05, nthread = n_threads,
     updater = "grow_colmaker,prune"
   )
   set.seed(11)
@@ -86,9 +93,11 @@ test_that("updating the model works", {
 })

 test_that("updating works for multiclass & multitree", {
-  dtr <- xgb.DMatrix(as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1)
+  dtr <- xgb.DMatrix(
+    as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1, nthread = n_threads
+  )
   watchlist <- list(train = dtr)
-  p0 <- list(max_depth = 2, eta = 0.5, nthread = 2, subsample = 0.6,
+  p0 <- list(max_depth = 2, eta = 0.5, nthread = n_threads, subsample = 0.6,
              objective = "multi:softprob", num_class = 3, num_parallel_tree = 2,
              base_score = 0)
   set.seed(121)
diff --git a/R-package/vignettes/discoverYourData.Rmd b/R-package/vignettes/discoverYourData.Rmd
index 8b9e2e2e3..4b04f771f 100644
--- a/R-package/vignettes/discoverYourData.Rmd
+++ b/R-package/vignettes/discoverYourData.Rmd
@@ -31,6 +31,8 @@ require(data.table)
 if (!require('vcd')) {
   install.packages('vcd')
 }
+
+data.table::setDTthreads(2)
 ```

 > **VCD** package is used for one of its embedded dataset only.
@@ -297,23 +299,25 @@ test <- agaricus.test

 #Random Forest - 1000 trees
 bst <- xgboost(
-  data = train$data
-  , label = train$label
-  , max_depth = 4
-  , num_parallel_tree = 1000
-  , subsample = 0.5
-  , colsample_bytree = 0.5
-  , nrounds = 1
-  , objective = "binary:logistic"
+  data = train$data,
+  label = train$label,
+  max_depth = 4,
+  num_parallel_tree = 1000,
+  subsample = 0.5,
+  colsample_bytree = 0.5,
+  nrounds = 1,
+  objective = "binary:logistic",
+  nthread = 2
 )

 #Boosting - 3 rounds
 bst <- xgboost(
-  data = train$data
-  , label = train$label
-  , max_depth = 4
-  , nrounds = 3
-  , objective = "binary:logistic"
+  data = train$data,
+  label = train$label,
+  max_depth = 4,
+  nrounds = 3,
+  objective = "binary:logistic",
+  nthread = 2
 )
 ```
diff --git a/R-package/vignettes/xgboost.Rnw b/R-package/vignettes/xgboost.Rnw
index c9089cd6f..7edf4ace3 100644
--- a/R-package/vignettes/xgboost.Rnw
+++ b/R-package/vignettes/xgboost.Rnw
@@ -86,9 +86,10 @@ data(agaricus.test, package='xgboost')
 train <- agaricus.train
 test <- agaricus.test
 bst <- xgboost(data = train$data, label = train$label, max_depth = 2, eta = 1,
-               nrounds = 2, objective = "binary:logistic")
+               nrounds = 2, objective = "binary:logistic", nthread = 2)
 xgb.save(bst, 'model.save')
 bst = xgb.load('model.save')
+xgb.parameters(bst) <- list(nthread = 2)
 pred <- predict(bst, test$data)
 @

@@ -127,7 +128,7 @@ training from initial prediction value, weighted training instance.
We can use \verb@xgb.DMatrix@ to construct an \verb@xgb.DMatrix@ object: <>= -dtrain <- xgb.DMatrix(train$data, label = train$label) +dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = 2) class(dtrain) head(getinfo(dtrain,'label')) @ @@ -161,9 +162,9 @@ evalerror <- function(preds, dtrain) { return(list(metric = "MSE", value = err)) } -dtest <- xgb.DMatrix(test$data, label = test$label) +dtest <- xgb.DMatrix(test$data, label = test$label, nthread = 2) watchlist <- list(eval = dtest, train = dtrain) -param <- list(max_depth = 2, eta = 1) +param <- list(max_depth = 2, eta = 1, nthread = 2) bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, logregobj, evalerror, maximize = FALSE) @ diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd index 34f5963d5..1b015fab8 100644 --- a/R-package/vignettes/xgboostPresentation.Rmd +++ b/R-package/vignettes/xgboostPresentation.Rmd @@ -173,13 +173,13 @@ Alternatively, you can put your dataset in a *dense* matrix, i.e. a basic **R** ```{r trainingDense, message=F, warning=F} bstDense <- xgboost( - data = as.matrix(train$data) - , label = train$label - , max_depth = 2 - , eta = 1 - , nthread = 2 - , nrounds = 2 - , objective = "binary:logistic" + data = as.matrix(train$data), + label = train$label, + max_depth = 2, + eta = 1, + nthread = 2, + nrounds = 2, + objective = "binary:logistic" ) ``` @@ -188,14 +188,14 @@ bstDense <- xgboost( **XGBoost** offers a way to group them in a `xgb.DMatrix`. You can even add other meta data in it. It will be useful for the most advanced features we will discover later. ```{r trainingDmatrix, message=F, warning=F} -dtrain <- xgb.DMatrix(data = train$data, label = train$label) +dtrain <- xgb.DMatrix(data = train$data, label = train$label, nthread = 2) bstDMatrix <- xgboost( - data = dtrain - , max_depth = 2 - , eta = 1 - , nthread = 2 - , nrounds = 2 - , objective = "binary:logistic" + data = dtrain, + max_depth = 2, + eta = 1, + nthread = 2, + nrounds = 2, + objective = "binary:logistic" ) ``` @@ -314,8 +314,8 @@ Most of the features below have been implemented to help you to improve your mod For the following advanced features, we need to put data in `xgb.DMatrix` as explained above. ```{r DMatrix, message=F, warning=F} -dtrain <- xgb.DMatrix(data = train$data, label = train$label) -dtest <- xgb.DMatrix(data = test$data, label = test$label) +dtrain <- xgb.DMatrix(data = train$data, label = train$label, nthread = 2) +dtest <- xgb.DMatrix(data = test$data, label = test$label, nthread = 2) ``` ### Measure learning progress with xgb.train @@ -476,6 +476,7 @@ An interesting test to see how identical our saved model is to the original one ```{r loadModel, message=F, warning=F} # load binary model to R bst2 <- xgb.load("xgboost.model") +xgb.parameters(bst2) <- list(nthread = 2) pred2 <- predict(bst2, test$data) # And now the test @@ -500,6 +501,7 @@ print(class(rawVec)) # load binary model to R bst3 <- xgb.load(rawVec) +xgb.parameters(bst3) <- list(nthread = 2) pred3 <- predict(bst3, test$data) # pred2 should be identical to pred diff --git a/R-package/vignettes/xgboostfromJSON.Rmd b/R-package/vignettes/xgboostfromJSON.Rmd index 0ea2f56cb..e7ccdf3a9 100644 --- a/R-package/vignettes/xgboostfromJSON.Rmd +++ b/R-package/vignettes/xgboostfromJSON.Rmd @@ -175,7 +175,7 @@ bst_preds == bst_from_json_preds None are exactly equal again. What is going on here? Well, since we are using the value `1` in the calculations, we have introduced a double into the calculation. 
Because of this, all float values are promoted to 64-bit doubles and the 64-bit version of the exponentiation operator `exp` is also used. On the other hand, xgboost uses the 32-bit version of the exponentiation operator in its [sigmoid function](https://github.com/dmlc/xgboost/blob/54980b8959680a0da06a3fc0ec776e47c8cbb0a1/src/common/math.h#L25-L27). -How do we fix this? We have to ensure we use the correct data types everywhere and the correct operators. If we use only floats, the float library that we have loaded will ensure the 32-bit float exponentiation operator is applied. +How do we fix this? We have to ensure we use the correct data types everywhere and the correct operators. If we use only floats, the float library that we have loaded will ensure the 32-bit float exponentiation operator is applied. ```{r} # calculate the predictions casting doubles to floats bst_from_json_preds <- ifelse( diff --git a/cmake/Doc.cmake b/cmake/Doc.cmake index 2ffa005ff..41c4fc961 100644 --- a/cmake/Doc.cmake +++ b/cmake/Doc.cmake @@ -1,16 +1,17 @@ -function (run_doxygen) +function(run_doxygen) find_package(Doxygen REQUIRED) - if (NOT DOXYGEN_DOT_FOUND) + if(NOT DOXYGEN_DOT_FOUND) message(FATAL_ERROR "Command `dot` not found. Please install graphviz.") - endif (NOT DOXYGEN_DOT_FOUND) + endif() configure_file( ${xgboost_SOURCE_DIR}/doc/Doxyfile.in ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile @ONLY) - add_custom_target( doc_doxygen ALL + add_custom_target( + doc_doxygen ALL COMMAND ${DOXYGEN_EXECUTABLE} ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMENT "Generate C APIs documentation." VERBATIM) -endfunction (run_doxygen) +endfunction() diff --git a/cmake/FindPrefetchIntrinsics.cmake b/cmake/FindPrefetchIntrinsics.cmake index b00ff57d7..71ae5899f 100644 --- a/cmake/FindPrefetchIntrinsics.cmake +++ b/cmake/FindPrefetchIntrinsics.cmake @@ -1,4 +1,4 @@ -function (find_prefetch_intrinsics) +function(find_prefetch_intrinsics) include(CheckCXXSourceCompiles) check_cxx_source_compiles(" #include <xmmintrin.h> @@ -19,4 +19,4 @@ function (find_prefetch_intrinsics) " XGBOOST_BUILTIN_PREFETCH_PRESENT) set(XGBOOST_MM_PREFETCH_PRESENT ${XGBOOST_MM_PREFETCH_PRESENT} PARENT_SCOPE) set(XGBOOST_BUILTIN_PREFETCH_PRESENT ${XGBOOST_BUILTIN_PREFETCH_PRESENT} PARENT_SCOPE) -endfunction (find_prefetch_intrinsics) +endfunction() diff --git a/cmake/Sanitizer.cmake b/cmake/Sanitizer.cmake index 77d7c93c1..176d967d0 100644 --- a/cmake/Sanitizer.cmake +++ b/cmake/Sanitizer.cmake @@ -12,9 +12,9 @@ macro(enable_sanitizer sanitizer) elseif(${sanitizer} MATCHES "thread") find_package(TSan) set(SAN_COMPILE_FLAGS "${SAN_COMPILE_FLAGS} -fsanitize=thread") - if (TSan_FOUND) + if(TSan_FOUND) link_libraries(${TSan_LIBRARY}) - endif (TSan_FOUND) + endif() elseif(${sanitizer} MATCHES "leak") find_package(LSan) @@ -33,16 +33,16 @@ macro(enable_sanitizers SANITIZERS) # Check sanitizers compatibility. # Ideally, we should use if(san IN_LIST SANITIZERS) ... endif() # But I haven't figured out how to make it work.
- foreach ( _san ${SANITIZERS} ) + foreach( _san ${SANITIZERS} ) string(TOLOWER ${_san} _san) - if (_san MATCHES "thread") - if (${_use_other_sanitizers}) + if(_san MATCHES "thread") + if(${_use_other_sanitizers}) message(FATAL_ERROR "thread sanitizer is not compatible with ${_san} sanitizer.") endif() set(_use_thread_sanitizer 1) - else () - if (${_use_thread_sanitizer}) + else() + if(${_use_thread_sanitizer}) message(FATAL_ERROR "${_san} sanitizer is not compatible with thread sanitizer.") endif() diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index 07968890d..4c94fbb45 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -11,7 +11,7 @@ function(auto_source_group SOURCES) source_group("${GROUP}" FILES "${FILE}") endforeach() -endfunction(auto_source_group) +endfunction() # Force static runtime for MSVC function(msvc_use_static_runtime) @@ -50,7 +50,7 @@ function(msvc_use_static_runtime) endif() endforeach() endif() -endfunction(msvc_use_static_runtime) +endfunction() # Set output directory of target, ignoring debug or release function(set_output_directory target dir) @@ -70,17 +70,17 @@ function(set_output_directory target dir) ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${dir} ARCHIVE_OUTPUT_DIRECTORY_RELWITHDEBINFO ${dir} ARCHIVE_OUTPUT_DIRECTORY_MINSIZEREL ${dir}) -endfunction(set_output_directory) +endfunction() # Set a default build type to release if none was specified function(set_default_configuration_release) if(CMAKE_CONFIGURATION_TYPES STREQUAL "Debug;Release;MinSizeRel;RelWithDebInfo") # multiconfig generator? set(CMAKE_CONFIGURATION_TYPES Release CACHE STRING "" FORCE) - elseif(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) - message(STATUS "Setting build type to 'Release' as none was specified.") - set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE ) - endif() -endfunction(set_default_configuration_release) + elseif(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + message(STATUS "Setting build type to 'Release' as none was specified.") + set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE) + endif() +endfunction() # Generate nvcc compiler flags given a list of architectures # Also generates PTX for the most recent architecture for forwards compatibility @@ -90,9 +90,9 @@ function(format_gencode_flags flags out) endif() # Set up architecture flags if(NOT flags) - if (CUDA_VERSION VERSION_GREATER_EQUAL "11.8") + if(CUDA_VERSION VERSION_GREATER_EQUAL "11.8") set(flags "50;60;70;80;90") - elseif (CUDA_VERSION VERSION_GREATER_EQUAL "11.0") + elseif(CUDA_VERSION VERSION_GREATER_EQUAL "11.0") set(flags "50;60;70;80") elseif(CUDA_VERSION VERSION_GREATER_EQUAL "10.0") set(flags "35;50;60;70") @@ -103,7 +103,7 @@ function(format_gencode_flags flags out) endif() endif() - if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") + if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") cmake_policy(SET CMP0104 NEW) list(GET flags -1 latest_arch) list(TRANSFORM flags APPEND "-real") @@ -121,8 +121,8 @@ function(format_gencode_flags flags out) set(${out} "${${out}}--generate-code=arch=compute_${ver},code=compute_${ver};") set(${out} "${${out}}" PARENT_SCOPE) message(STATUS "CUDA GEN_CODE: ${GEN_CODE}") - endif (CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") -endfunction(format_gencode_flags flags) + endif() +endfunction() # Set CUDA related flags to target. Must be used after code `format_gencode_flags`. 
function(xgboost_set_cuda_flags target) @@ -133,35 +133,35 @@ function(xgboost_set_cuda_flags target) $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=${OpenMP_CXX_FLAGS}> $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all>) - if (USE_PER_THREAD_DEFAULT_STREAM) + if(USE_PER_THREAD_DEFAULT_STREAM) target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--default-stream per-thread>) - endif (USE_PER_THREAD_DEFAULT_STREAM) + endif() - if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") + if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") set_property(TARGET ${target} PROPERTY CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES}) - endif (CMAKE_VERSION VERSION_GREATER_EQUAL "3.18") + endif() - if (FORCE_COLORED_OUTPUT) - if (FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND + if(FORCE_COLORED_OUTPUT) + if(FORCE_COLORED_OUTPUT AND (CMAKE_GENERATOR STREQUAL "Ninja") AND ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR (CMAKE_CXX_COMPILER_ID STREQUAL "Clang"))) target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=-fdiagnostics-color=always>) endif() - endif (FORCE_COLORED_OUTPUT) + endif() - if (USE_DEVICE_DEBUG) + if(USE_DEVICE_DEBUG) target_compile_options(${target} PRIVATE $<$<AND:$<CONFIG:DEBUG>,$<COMPILE_LANGUAGE:CUDA>>:-G;-src-in-ptx>) - else (USE_DEVICE_DEBUG) + else() target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-lineinfo>) - endif (USE_DEVICE_DEBUG) + endif() - if (USE_NVTX) + if(USE_NVTX) target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NVTX=1) - endif (USE_NVTX) + endif() target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_CUDA=1) target_include_directories( @@ -169,17 +169,17 @@ ${xgboost_SOURCE_DIR}/gputreeshap ${CUDAToolkit_INCLUDE_DIRS}) - if (MSVC) + if(MSVC) target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/utf-8>) - endif (MSVC) + endif() set_target_properties(${target} PROPERTIES CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON CUDA_SEPARABLE_COMPILATION OFF CUDA_RUNTIME_LIBRARY Static) -endfunction(xgboost_set_cuda_flags) +endfunction() # Set HIP related flags to target.
function(xgboost_set_hip_flags target) @@ -199,16 +199,16 @@ function(xgboost_set_hip_flags target) endfunction(xgboost_set_hip_flags) macro(xgboost_link_nccl target) - if (BUILD_STATIC_LIB) + if(BUILD_STATIC_LIB) target_include_directories(${target} PUBLIC ${NCCL_INCLUDE_DIR}) target_compile_definitions(${target} PUBLIC -DXGBOOST_USE_NCCL=1) target_link_libraries(${target} PUBLIC ${NCCL_LIBRARY}) - else () + else() target_include_directories(${target} PRIVATE ${NCCL_INCLUDE_DIR}) target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_NCCL=1) target_link_libraries(${target} PRIVATE ${NCCL_LIBRARY}) - endif (BUILD_STATIC_LIB) -endmacro(xgboost_link_nccl) + endif() +endmacro() # compile options macro(xgboost_target_properties target) @@ -217,110 +217,106 @@ macro(xgboost_target_properties target) CXX_STANDARD_REQUIRED ON POSITION_INDEPENDENT_CODE ON) - if (HIDE_CXX_SYMBOLS) + if(HIDE_CXX_SYMBOLS) #-- Hide all C++ symbols set_target_properties(${target} PROPERTIES C_VISIBILITY_PRESET hidden CXX_VISIBILITY_PRESET hidden CUDA_VISIBILITY_PRESET hidden ) - endif (HIDE_CXX_SYMBOLS) + endif() - if (ENABLE_ALL_WARNINGS) + if(ENABLE_ALL_WARNINGS) target_compile_options(${target} PUBLIC $<IF:$<COMPILE_LANGUAGE:CUDA>, -Xcompiler=-Wall -Xcompiler=-Wextra -Xcompiler=-Wno-expansion-to-defined, -Wall -Wextra -Wno-expansion-to-defined> ) - target_compile_options(${target} PUBLIC - $<IF:$<COMPILE_LANGUAGE:HIP>, - -Wall -Wextra > - ) - endif(ENABLE_ALL_WARNINGS) + endif() target_compile_options(${target} PRIVATE $<$<AND:$<CXX_COMPILER_ID:MSVC>,$<COMPILE_LANGUAGE:CXX>>:/MP> $<$<AND:$<NOT:$<CXX_COMPILER_ID:MSVC>>,$<COMPILE_LANGUAGE:CXX>>:-funroll-loops>) - if (MSVC) + if(MSVC) target_compile_options(${target} PRIVATE $<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/utf-8> -D_CRT_SECURE_NO_WARNINGS -D_CRT_SECURE_NO_DEPRECATE ) - endif (MSVC) + endif() - if (WIN32 AND MINGW) + if(WIN32 AND MINGW) target_compile_options(${target} PUBLIC -static-libstdc++) - endif (WIN32 AND MINGW) -endmacro(xgboost_target_properties) +endmacro() # Custom definitions used in xgboost. macro(xgboost_target_defs target) - if (NOT ${target} STREQUAL "dmlc") # skip dmlc core for custom logging. + if(NOT ${target} STREQUAL "dmlc") # skip dmlc core for custom logging.
target_compile_definitions(${target} PRIVATE -DDMLC_LOG_CUSTOMIZE=1 $<$<NOT:$<CXX_COMPILER_ID:MSVC>>:_MWAITXINTRIN_H_INCLUDED>) - endif () - if (USE_DEBUG_OUTPUT) + endif() + if(USE_DEBUG_OUTPUT) target_compile_definitions(${target} PRIVATE -DXGBOOST_USE_DEBUG_OUTPUT=1) - endif (USE_DEBUG_OUTPUT) - if (XGBOOST_MM_PREFETCH_PRESENT) + endif() + if(XGBOOST_MM_PREFETCH_PRESENT) target_compile_definitions(${target} PRIVATE -DXGBOOST_MM_PREFETCH_PRESENT=1) - endif(XGBOOST_MM_PREFETCH_PRESENT) + endif() - if (XGBOOST_BUILTIN_PREFETCH_PRESENT) + if(XGBOOST_BUILTIN_PREFETCH_PRESENT) target_compile_definitions(${target} PRIVATE -DXGBOOST_BUILTIN_PREFETCH_PRESENT=1) - endif (XGBOOST_BUILTIN_PREFETCH_PRESENT) + endif() - if (PLUGIN_RMM) + if(PLUGIN_RMM) target_compile_definitions(objxgboost PUBLIC -DXGBOOST_USE_RMM=1) - endif (PLUGIN_RMM) -endmacro(xgboost_target_defs) + endif() +endmacro() # handles dependencies macro(xgboost_target_link_libraries target) - if (BUILD_STATIC_LIB) + if(BUILD_STATIC_LIB) target_link_libraries(${target} PUBLIC Threads::Threads ${CMAKE_THREAD_LIBS_INIT}) else() target_link_libraries(${target} PRIVATE Threads::Threads ${CMAKE_THREAD_LIBS_INIT}) - endif (BUILD_STATIC_LIB) + endif() - if (USE_OPENMP) - if (BUILD_STATIC_LIB) + if(USE_OPENMP) + if(BUILD_STATIC_LIB) target_link_libraries(${target} PUBLIC OpenMP::OpenMP_CXX) else() target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX) - endif (BUILD_STATIC_LIB) - endif (USE_OPENMP) + endif() + endif() - if (USE_CUDA) + if(USE_CUDA) xgboost_set_cuda_flags(${target}) target_link_libraries(${target} PUBLIC CUDA::cudart_static) - endif (USE_CUDA) + endif() if (USE_HIP) xgboost_set_hip_flags(${target}) endif (USE_HIP) - if (PLUGIN_RMM) + if(PLUGIN_RMM) target_link_libraries(${target} PRIVATE rmm::rmm) - endif (PLUGIN_RMM) + endif() - if (USE_NCCL) + if(USE_NCCL) xgboost_link_nccl(${target}) - endif (USE_NCCL) + endif() - if (USE_NVTX) + if(USE_NVTX) target_link_libraries(${target} PRIVATE CUDA::nvToolsExt) - endif (USE_NVTX) + endif() - if (MINGW) + if(MINGW) target_link_libraries(${target} PRIVATE wsock32 ws2_32) - endif (MINGW) -endmacro(xgboost_target_link_libraries) + endif() +endmacro() diff --git a/cmake/Version.cmake b/cmake/Version.cmake index ea8c081dc..4af6b27d6 100644 --- a/cmake/Version.cmake +++ b/cmake/Version.cmake @@ -1,6 +1,6 @@ -function (write_version) +function(write_version) message(STATUS "xgboost VERSION: ${xgboost_VERSION}") configure_file( ${xgboost_SOURCE_DIR}/cmake/version_config.h.in ${xgboost_SOURCE_DIR}/include/xgboost/version_config.h @ONLY) -endfunction (write_version) +endfunction() diff --git a/cmake/modules/FindLibR.cmake b/cmake/modules/FindLibR.cmake index c9d9509fa..1eb384238 100644 --- a/cmake/modules/FindLibR.cmake +++ b/cmake/modules/FindLibR.cmake @@ -66,7 +66,7 @@ function(create_rlib_for_msvc) execute_process(COMMAND ${DLLTOOL_EXE} "--input-def" "${CMAKE_CURRENT_BINARY_DIR}/R.def" "--output-lib" "${CMAKE_CURRENT_BINARY_DIR}/R.lib") -endfunction(create_rlib_for_msvc) +endfunction() # detection for OSX diff --git a/cmake/modules/FindNVML.cmake b/cmake/modules/FindNVML.cmake index a4bed0019..f0e72f371 100644 --- a/cmake/modules/FindNVML.cmake +++ b/cmake/modules/FindNVML.cmake @@ -1,6 +1,6 @@ -if (NVML_LIBRARY) +if(NVML_LIBRARY) unset(NVML_LIBRARY CACHE) -endif(NVML_LIBRARY) +endif() set(NVML_LIB_NAME nvml) diff --git a/cmake/modules/FindNccl.cmake b/cmake/modules/FindNccl.cmake index 5f06f96b8..02ee731a1 100644 --- a/cmake/modules/FindNccl.cmake +++ b/cmake/modules/FindNccl.cmake @@ -35,20 +35,20 @@ # # This
module assumes that the user has already called find_package(CUDA) -if (NCCL_LIBRARY) +if(NCCL_LIBRARY) if(NOT USE_NCCL_LIB_PATH) # Don't cache NCCL_LIBRARY to enable switching between static and shared. unset(NCCL_LIBRARY CACHE) - endif(NOT USE_NCCL_LIB_PATH) + endif() endif() -if (BUILD_WITH_SHARED_NCCL) +if(BUILD_WITH_SHARED_NCCL) # libnccl.so set(NCCL_LIB_NAME nccl) -else () +else() # libnccl_static.a set(NCCL_LIB_NAME nccl_static) -endif (BUILD_WITH_SHARED_NCCL) +endif() find_path(NCCL_INCLUDE_DIR NAMES nccl.h diff --git a/demo/c-api/basic/CMakeLists.txt b/demo/c-api/basic/CMakeLists.txt index 32e2bc432..74567c9c8 100644 --- a/demo/c-api/basic/CMakeLists.txt +++ b/demo/c-api/basic/CMakeLists.txt @@ -3,11 +3,11 @@ find_package(xgboost REQUIRED) # xgboost is built as static libraries, all cxx dependencies need to be linked into the # executable. -if (XGBOOST_BUILD_STATIC_LIB) +if(XGBOOST_BUILD_STATIC_LIB) enable_language(CXX) # find again for those cxx libraries. find_package(xgboost REQUIRED) -endif(XGBOOST_BUILD_STATIC_LIB) +endif() add_executable(api-demo c-api-demo.c) target_link_libraries(api-demo PRIVATE xgboost::xgboost) diff --git a/demo/c-api/inference/CMakeLists.txt b/demo/c-api/inference/CMakeLists.txt index 6aa8f1dd2..5eded43e2 100644 --- a/demo/c-api/inference/CMakeLists.txt +++ b/demo/c-api/inference/CMakeLists.txt @@ -4,11 +4,11 @@ find_package(xgboost REQUIRED) # xgboost is built as static libraries, all cxx dependencies need to be linked into the # executable. -if (XGBOOST_BUILD_STATIC_LIB) +if(XGBOOST_BUILD_STATIC_LIB) enable_language(CXX) # find again for those cxx libraries. find_package(xgboost REQUIRED) -endif(XGBOOST_BUILD_STATIC_LIB) +endif() add_executable(inference-demo inference.c) target_link_libraries(inference-demo PRIVATE xgboost::xgboost) diff --git a/demo/guide-python/callbacks.py b/demo/guide-python/callbacks.py index be03b1693..9c12f70de 100644 --- a/demo/guide-python/callbacks.py +++ b/demo/guide-python/callbacks.py @@ -104,7 +104,7 @@ def check_point_callback(): # Use callback class from xgboost.callback # Feel free to subclass/customize it to suit your need. check_point = xgb.callback.TrainingCheckPoint( - directory=tmpdir, iterations=rounds, name="model" + directory=tmpdir, interval=rounds, name="model" ) xgb.train( {"objective": "binary:logistic"}, @@ -118,7 +118,7 @@ def check_point_callback(): # This version of checkpoint saves everything including parameters and # model. 
See: doc/tutorials/saving_model.rst check_point = xgb.callback.TrainingCheckPoint( - directory=tmpdir, iterations=rounds, as_pickle=True, name="model" + directory=tmpdir, interval=rounds, as_pickle=True, name="model" ) xgb.train( {"objective": "binary:logistic"}, diff --git a/demo/kaggle-higgs/higgs-train.R b/demo/kaggle-higgs/higgs-train.R index 6f37040f8..4730d7b3d 100644 --- a/demo/kaggle-higgs/higgs-train.R +++ b/demo/kaggle-higgs/higgs-train.R @@ -24,8 +24,8 @@ param <- list("objective" = "binary:logitraw", "nthread" = 16) watchlist <- list("train" = xgmat) nrounds <- 120 -print ("loading data end, start to boost trees") +print("loading data end, start to boost trees") bst <- xgb.train(param, xgmat, nrounds, watchlist) # save out model xgb.save(bst, "higgs.model") -print ('finish training') +print('finish training') diff --git a/demo/kaggle-higgs/speedtest.R b/demo/kaggle-higgs/speedtest.R index c0e96a010..c6de1511b 100644 --- a/demo/kaggle-higgs/speedtest.R +++ b/demo/kaggle-higgs/speedtest.R @@ -39,11 +39,11 @@ for (i in seq_along(threads)){ "nthread" = thread) watchlist <- list("train" = xgmat) nrounds <- 120 - print ("loading data end, start to boost trees") + print("loading data end, start to boost trees") bst <- xgb.train(param, xgmat, nrounds, watchlist) # save out model xgb.save(bst, "higgs.model") - print ('finish training') + print('finish training') }) } diff --git a/demo/nvflare/horizontal/README.md b/demo/nvflare/horizontal/README.md index 19ac4cf4e..7337f1720 100644 --- a/demo/nvflare/horizontal/README.md +++ b/demo/nvflare/horizontal/README.md @@ -85,8 +85,8 @@ shutdown server ## Training with GPUs To demo with Federated Learning using GPUs, make sure your machine has at least 2 GPUs. -Build XGBoost with the federated learning plugin enabled along with CUDA, but with NCCL -turned off (see the [README](../../plugin/federated/README.md)). +Build XGBoost with the federated learning plugin enabled along with CUDA +(see the [README](../../plugin/federated/README.md)). -Modify `config/config_fed_client.json` and set `use_gpus` to `true`, then repeat the steps +Modify `../config/config_fed_client.json` and set `use_gpus` to `true`, then repeat the steps above. diff --git a/demo/nvflare/horizontal/custom/trainer.py b/demo/nvflare/horizontal/custom/trainer.py index b1ec94211..4f20b2f39 100644 --- a/demo/nvflare/horizontal/custom/trainer.py +++ b/demo/nvflare/horizontal/custom/trainer.py @@ -67,7 +67,7 @@ class XGBoostTrainer(Executor): dtest = xgb.DMatrix('agaricus.txt.test?format=libsvm') # Specify parameters via map, definition are same as c++ version - param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'} + param = {'tree_method': 'hist', 'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'} if self._use_gpus: self.log_info(fl_ctx, f'Training with GPU {rank}') param['device'] = f"cuda:{rank}" diff --git a/demo/nvflare/vertical/README.md b/demo/nvflare/vertical/README.md index f9cca57d9..d63b2bca4 100644 --- a/demo/nvflare/vertical/README.md +++ b/demo/nvflare/vertical/README.md @@ -56,4 +56,9 @@ shutdown server ## Training with GPUs -Currently GPUs are not yet supported by vertical federated XGBoost. +To demo with Vertical Federated Learning using GPUs, make sure your machine has at least 2 GPUs. +Build XGBoost with the federated learning plugin enabled along with CUDA +(see the [README](../../plugin/federated/README.md)). + +Modify `../config/config_fed_client.json` and set `use_gpus` to `true`, then repeat the steps +above. 
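+
+For example, the relevant client configuration fragment might look like the
+following (a hypothetical sketch; the actual `config_fed_client.json` contains
+additional NVFlare fields):
+
+```json
+{
+  "use_gpus": true
+}
+```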
diff --git a/demo/nvflare/vertical/custom/trainer.py b/demo/nvflare/vertical/custom/trainer.py index 1c235a439..b6c3855ef 100644 --- a/demo/nvflare/vertical/custom/trainer.py +++ b/demo/nvflare/vertical/custom/trainer.py @@ -77,13 +77,14 @@ class XGBoostTrainer(Executor): 'gamma': 1.0, 'max_depth': 8, 'min_child_weight': 100, - 'tree_method': 'approx', + 'tree_method': 'hist', 'grow_policy': 'depthwise', 'objective': 'binary:logistic', 'eval_metric': 'auc', } if self._use_gpus: - self.log_info(fl_ctx, 'GPUs are not currently supported by vertical federated XGBoost') + self.log_info(fl_ctx, f'Training with GPU {rank}') + param['device'] = f"cuda:{rank}" # specify validations set to watch performance watchlist = [(dtest, "eval"), (dtrain, "train")] diff --git a/dev/release-artifacts.py b/dev/release-artifacts.py index d9b9d6203..429fac078 100644 --- a/dev/release-artifacts.py +++ b/dev/release-artifacts.py @@ -250,8 +250,8 @@ echo " " | shasum -a 256 --check ``` **Experimental binary packages for R with CUDA enabled** -* xgboost_r_gpu_linux_1.7.5.tar.gz: [Download]({r_gpu_linux_url}) -* xgboost_r_gpu_win64_1.7.5.tar.gz: [Download]({r_gpu_win64_url}) +* xgboost_r_gpu_linux_{release}.tar.gz: [Download]({r_gpu_linux_url}) +* xgboost_r_gpu_win64_{release}.tar.gz: [Download]({r_gpu_win64_url}) **Source tarball** * xgboost.tar.gz: [Download]({src_tarball})""" @@ -296,12 +296,13 @@ def main(args: argparse.Namespace) -> None: git.submodule("update") commit_hash = latest_hash() - if not os.path.exists(args.outdir): - os.mkdir(args.outdir) + outdir = os.path.abspath(args.outdir) + if not os.path.exists(outdir): + os.mkdir(outdir) # source tarball hashes: List[str] = [] - tarname, h = make_src_package(release, args.outdir) + tarname, h = make_src_package(release, outdir) hashes.append(h) # CUDA R packages @@ -310,18 +311,18 @@ def main(args: argparse.Namespace) -> None: branch, "" if rc is None else rc + str(rc_ver), commit_hash, - args.outdir, + outdir, ) hashes.extend(hr) # Python source wheel - make_pysrc_wheel(release, rc, rc_ver, args.outdir) + make_pysrc_wheel(release, rc, rc_ver, outdir) # Python binary wheels - download_py_packages(branch, major, minor, commit_hash, args.outdir) + download_py_packages(branch, major, minor, commit_hash, outdir) # Write end note - release_note(release, hashes, urls, tarname, args.outdir) + release_note(release, hashes, urls, tarname, outdir) if __name__ == "__main__": diff --git a/doc/contrib/coding_guide.rst b/doc/contrib/coding_guide.rst index e799ad286..1169921bb 100644 --- a/doc/contrib/coding_guide.rst +++ b/doc/contrib/coding_guide.rst @@ -80,6 +80,24 @@ R package versioning ==================== See :ref:`release`. +Testing R package with different compilers +========================================== + +You can change the default compiler of R by changing the configuration file in your home +directory. For instance, if you want to test XGBoost built with clang++ instead of g++ on +Linux, put the following in your ``~/.R/Makevars`` file: + +.. code-block:: sh + + CC=clang-15 + CXX17=clang++-15 + +Be aware that the variable name should match the name used by ``R CMD``: + +..
code-block:: sh + + R CMD config CXX17 + Registering native routines in R ================================ According to `R extension manual <https://cran.r-project.org/doc/manuals/r-release/R-exts.html>`_, diff --git a/doc/contrib/python_packaging.rst b/doc/contrib/python_packaging.rst index 5cf085685..ebd9d36ec 100644 --- a/doc/contrib/python_packaging.rst +++ b/doc/contrib/python_packaging.rst @@ -35,7 +35,7 @@ Building sdists In the case of XGBoost, an sdist contains both the Python code as well as the C++ code, so that the core part of XGBoost can be compiled into the -shared libary ``libxgboost.so`` [#shared_lib_name]_. +shared library ``libxgboost.so`` [#shared_lib_name]_. You can obtain an sdist as follows: diff --git a/doc/contrib/unit_tests.rst b/doc/contrib/unit_tests.rst index ef4ad1480..2aa97e812 100644 --- a/doc/contrib/unit_tests.rst +++ b/doc/contrib/unit_tests.rst @@ -16,7 +16,14 @@ Adding a new unit test Python package: pytest ====================== -Add your test under the directory `tests/python/ <https://github.com/dmlc/xgboost/tree/master/tests/python>`_ or `tests/python-gpu/ <https://github.com/dmlc/xgboost/tree/master/tests/python-gpu>`_ (if you are testing GPU code). Refer to `the PyTest tutorial `_ to learn how to write tests for Python code. +Add your test under the directories + +- `tests/python/ <https://github.com/dmlc/xgboost/tree/master/tests/python>`_ +- `tests/python-gpu/ <https://github.com/dmlc/xgboost/tree/master/tests/python-gpu>`_ (if you are testing GPU code) +- `tests/test_distributed <https://github.com/dmlc/xgboost/tree/master/tests/test_distributed>`_ (if a distributed framework is used) + +Refer to `the PyTest tutorial `_ +to learn how to write tests for Python code. You may try running your test by following instructions in :ref:`this section `. @@ -56,19 +63,26 @@ Run .. code-block:: bash - make Rcheck + python ./tests/ci_build/test_r_package.py --task=check -at the root of the project directory. +at the root of the project directory. The command builds and checks the XGBoost +R package. Alternatively, if you want to just run the tests, you can use the following +commands after installing XGBoost: + +.. code-block:: bash + + cd R-package/tests/ + Rscript testthat.R .. _running_jvm_tests: JVM packages ============ -As part of the building process, tests are run: +Maven is used to run the JVM tests: .. code-block:: bash - mvn package + mvn test .. _running_pytest: @@ -99,6 +113,14 @@ In addition, to test CUDA code, run: (For this step, you should have compiled XGBoost with CUDA enabled.) +For testing with distributed frameworks like ``Dask`` and ``PySpark``: + +.. code:: bash + + # Tell Python where to find XGBoost module + export PYTHONPATH=./python-package + pytest -v -s --fulltrace tests/test_distributed + .. _running_gtest: C++: Google Test ================ @@ -110,21 +132,13 @@ To build and run C++ unit tests enable tests while running CMake: mkdir build cd build - cmake -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON .. - make - make test + cmake -GNinja -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_CUDA=ON -DUSE_NCCL=ON .. + ninja + ./testxgboost -To enable tests for CUDA code, add ``-DUSE_CUDA=ON`` and ``-DUSE_NCCL=ON`` (CUDA toolkit required): - -.. code-block:: bash - - mkdir build - cd build - cmake -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DUSE_CUDA=ON -DUSE_NCCL=ON .. - make - make test - -One can also run all unit test using ctest tool which provides higher flexibility. For example: +Flags like ``USE_CUDA``, ``USE_DMLC_GTEST`` are optional. For more info about how to build +XGBoost from source, see :doc:`/build`. One can also run all unit tests using the ctest tool, +which provides higher flexibility. For example: .. code-block:: bash @@ -157,14 +171,14 @@ sanitizer is not compatible with the other two sanitizers. ..
code-block:: bash - cmake -DUSE_SANITIZER=ON -DENABLED_SANITIZERS="address;leak" /path/to/xgboost + cmake -DUSE_SANITIZER=ON -DENABLED_SANITIZERS="address;undefined" /path/to/xgboost By default, CMake will search regular system paths for sanitizers, you can also supply a specified SANITIZER_PATH. .. code-block:: bash - cmake -DUSE_SANITIZER=ON -DENABLED_SANITIZERS="address;leak" \ + cmake -DUSE_SANITIZER=ON -DENABLED_SANITIZERS="address;undefined" \ -DSANITIZER_PATH=/path/to/sanitizers /path/to/xgboost How to use sanitizers with CUDA support ======================================= @@ -181,7 +195,7 @@ environment variable: Other sanitizer runtime options =============================== -By default undefined sanitizer doesn't print out the backtrace. You can enable it by +By default, the undefined sanitizer doesn't print out the backtrace. You can enable it by exporting environment variable: .. code-block:: diff --git a/doc/prediction.rst b/doc/prediction.rst index c94ddfbbf..100c82a1e 100644 --- a/doc/prediction.rst +++ b/doc/prediction.rst @@ -146,3 +146,48 @@ instance we might accidentally call ``clf.set_params()`` inside a predict functi with ThreadPoolExecutor(max_workers=10) as e: e.submit(predict_fn, ...) + +***************************** +Privacy-Preserving Prediction +***************************** + +`Concrete ML`_ is a third-party open-source library developed by `Zama`_ that provides gradient +boosting classes similar to ours, but predicting directly over encrypted data, thanks to +Fully Homomorphic Encryption. A simple example would be as follows: + +.. code-block:: python + + from sklearn.datasets import make_classification + from sklearn.model_selection import train_test_split + from concrete.ml.sklearn import XGBClassifier + + x, y = make_classification(n_samples=100, class_sep=2, n_features=30, random_state=42) + X_train, X_test, y_train, y_test = train_test_split( + x, y, test_size=10, random_state=42 + ) + + # Train in the clear and quantize the weights + model = XGBClassifier() + model.fit(X_train, y_train) + + # Simulate the predictions in the clear + y_pred_clear = model.predict(X_test) + + # Compile in FHE + model.compile(X_train) + + # Generate keys + model.fhe_circuit.keygen() + + # Run the inference on encrypted inputs! + y_pred_fhe = model.predict(X_test, fhe="execute") + + print("In clear :", y_pred_clear) + print("In FHE :", y_pred_fhe) + print(f"Similarity: {int((y_pred_fhe == y_pred_clear).mean()*100)}%") + +More information and examples are given in the `Concrete ML documentation`_. + +.. _Zama: https://www.zama.ai/ +.. _Concrete ML: https://github.com/zama-ai/concrete-ml +..
_Concrete ML documentation: https://docs.zama.ai/concrete-ml diff --git a/doc/python/python_intro.rst b/doc/python/python_intro.rst index bb74e7bc3..cc0e461e0 100644 --- a/doc/python/python_intro.rst +++ b/doc/python/python_intro.rst @@ -172,9 +172,8 @@ Support Matrix +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ | modin.Series | NPA | FF | NPA | NPA | FF | | +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ -| pyarrow.Table | T | F | | NPA | FF | | +| pyarrow.Table | NPA | NPA | NPA | NPA | NPA | NPA | +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ -| pyarrow.dataset.Dataset | T | F | | | F | | +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ | _\_array\_\_ | NPA | F | NPA | NPA | H | | +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ diff --git a/doc/tutorials/index.rst b/doc/tutorials/index.rst index 7693173e9..5d090ce65 100644 --- a/doc/tutorials/index.rst +++ b/doc/tutorials/index.rst @@ -30,3 +30,4 @@ See `Awesome XGBoost `_ for mo input_format param_tuning custom_metric_obj + privacy_preserving \ No newline at end of file diff --git a/doc/tutorials/learning_to_rank.rst b/doc/tutorials/learning_to_rank.rst index c562dc2df..015f736e0 100644 --- a/doc/tutorials/learning_to_rank.rst +++ b/doc/tutorials/learning_to_rank.rst @@ -58,6 +58,7 @@ Notice that the samples are sorted based on their query index in a non-decreasin sorted_idx = np.argsort(qid) X = X[sorted_idx, :] y = y[sorted_idx] + qid = qid[sorted_idx] The simplest way to train a ranking model is by using the scikit-learn estimator interface. Continuing the previous snippet, we can train a simple ranking model without tuning: diff --git a/doc/tutorials/privacy_preserving.rst b/doc/tutorials/privacy_preserving.rst new file mode 100644 index 000000000..132861f7c --- /dev/null +++ b/doc/tutorials/privacy_preserving.rst @@ -0,0 +1,97 @@ +############################################# +Privacy Preserving Inference with Concrete ML +############################################# + +`Concrete ML`_ is a specialized library developed by Zama that allows the execution of machine learning models on encrypted data through `Fully Homomorphic Encryption (FHE) `_, thereby preserving data privacy. + +To use models such as XGBClassifier, use the following import: + +.. code:: python + + from concrete.ml.sklearn import XGBClassifier + +*************************************** +Performing Privacy Preserving Inference +*************************************** + +Initialization of an XGBClassifier can be done as follows: + +.. code:: python + + classifier = XGBClassifier(n_bits=6)  # other XGBoost hyper-parameters can be passed as well + + +where ``n_bits`` determines the precision of the input features. Note that a higher value of ``n_bits`` increases the precision of the input features and possibly the final model accuracy, but also results in longer FHE execution times. + +Other hyper-parameters that exist in the xgboost library can be used. + +****************************** +Model Training and Compilation +****************************** + +As with scikit-learn models, it can be trained with the ``.fit()`` method. + +..
code:: python + + classifier.fit(X_train, y_train) + +After training, the model can be compiled with a calibration dataset, potentially a subset of the training data: + +.. code:: python + + classifier.compile(X_calibrate) + +This calibration dataset, ``X_calibrate``, is used by Concrete ML to compute the precision (bit-width) of each intermediate value in the model. This is a necessary step to optimize the equivalent FHE circuit. + +**************************** +FHE Simulation and Execution +**************************** + +To verify model accuracy in encrypted computations, you can run an FHE simulation: + +.. code:: python + + predictions = classifier.predict(X_test, fhe="simulate") + +This simulation can be used to evaluate the model. The resulting accuracy of this simulation step is representative of the actual FHE execution without having to pay the cost of an actual FHE execution. + +When the model is ready, actual Fully Homomorphic Encryption execution can be performed: + +.. code:: python + + predictions = classifier.predict(X_test, fhe="execute") + + +Note that using ``fhe="execute"`` is a convenient way to assess the model in FHE, but for real deployment, functions to encrypt (on the client), run in FHE (on the server), and finally decrypt (on the client) have to be used for end-to-end privacy-preserving inferences. + +Concrete ML provides a deployment API to facilitate this process, ensuring end-to-end privacy. + +To go further with the deployment API, you can read: + +- the `deployment documentation `_ +- the `deployment notebook `_ + +******************************* +Parameter Tuning in Concrete ML +******************************* + +Concrete ML is compatible with standard scikit-learn utilities such as ``GridSearchCV`` and other hyper-parameter tuning techniques. + +****************** +Examples and Demos +****************** + +- `Sentiment analysis (based on transformers + xgboost) `_ +- `XGBoost Classifier `_ +- `XGBoost Regressor `_ + +********** +Conclusion +********** + +Concrete ML provides a framework for executing privacy-preserving inferences by leveraging Fully Homomorphic Encryption, allowing secure and private computations on encrypted data. + +More information and examples are given in the `Concrete ML documentation`_. + +.. _Concrete ML: https://github.com/zama-ai/concrete-ml +.. _`Concrete ML documentation`: https://docs.zama.ai/concrete-ml \ No newline at end of file diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index afc1f47fd..d28b5098b 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -144,9 +144,7 @@ XGB_DLL int XGDMatrixCreateFromFile(const char *fname, int silent, DMatrixHandle * See :doc:`/tutorials/input_format` for more info. * \endverbatim * - silent (optional): Whether to print message during loading. Default to true. - * - data_split_mode (optional): Whether to split by row or column. In distributed mode, the - * file is split accordingly; otherwise this is only an indicator on how the file was split - * beforehand. Default to row. + * - data_split_mode (optional): Whether the file was split by row or column beforehand for distributed computing. Default to row. * \param out a loaded data matrix * \return 0 when success, -1 when failure happens */ @@ -174,6 +172,7 @@ XGB_DLL int XGDMatrixCreateFromCSREx(const size_t *indptr, const unsigned *indic * \param config JSON encoded configuration. Required values are: * - missing: Which value to represent missing value.
* - nthread (optional): Number of threads used for initializing DMatrix. + * - data_split_mode (optional): Whether the data was split by row or column beforehand. Default to row. * \param out created dmatrix * \return 0 when success, -1 when failure happens */ @@ -186,6 +185,7 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr, char const *indices, char * \param config JSON encoded configuration. Required values are: * - missing: Which value to represent missing value. * - nthread (optional): Number of threads used for initializing DMatrix. + * - data_split_mode (optional): Whether the data was split by row or column beforehand. Default to row. * \param out created dmatrix * \return 0 when success, -1 when failure happens */ @@ -200,6 +200,7 @@ XGB_DLL int XGDMatrixCreateFromDense(char const *data, char const *config, DMatr * \param config JSON encoded configuration. Supported values are: * - missing: Which value to represent missing value. * - nthread (optional): Number of threads used for initializing DMatrix. + * - data_split_mode (optional): Whether the data was split by row or column beforehand. Default to row. * \param out created dmatrix * \return 0 when success, -1 when failure happens */ @@ -266,6 +267,7 @@ XGB_DLL int XGDMatrixCreateFromDT(void** data, * \param config JSON encoded configuration. Required values are: * - missing: Which value to represent missing value. * - nthread (optional): Number of threads used for initializing DMatrix. + * - data_split_mode (optional): Whether the data was split by row or column beforehand. Default to row. * \param out created dmatrix * \return 0 when success, -1 when failure happens */ @@ -278,6 +280,7 @@ XGB_DLL int XGDMatrixCreateFromCudaColumnar(char const *data, char const *config * \param config JSON encoded configuration. Required values are: * - missing: Which value to represent missing value. * - nthread (optional): Number of threads used for initializing DMatrix. + * - data_split_mode (optional): Whether the data was split by row or column beforehand. Default to row. * \param out created dmatrix * \return 0 when success, -1 when failure happens */ @@ -552,24 +555,6 @@ XGB_DLL int XGProxyDMatrixSetDataCSR(DMatrixHandle handle, char const *indptr, /** @} */ // End of Streaming -XGB_DLL int XGImportArrowRecordBatch(DataIterHandle data_handle, void *ptr_array, void *ptr_schema); - -/*! - * \brief Construct DMatrix from arrow using callbacks. Arrow related C API is not stable - * and subject to change in the future. - * - * \param next Callback function for fetching arrow records. - * \param config JSON encoded configuration. Required values are: - * - missing: Which value to represent missing value. - * - nbatch: Number of batches in arrow table. - * - nthread (optional): Number of threads used for initializing DMatrix. - * \param out The created DMatrix. - * - * \return 0 when success, -1 when failure happens - */ -XGB_DLL int XGDMatrixCreateFromArrowCallback(XGDMatrixCallbackNext *next, char const *config, - DMatrixHandle *out); - /*! * \brief create a new dmatrix from sliced content of existing matrix * \param handle instance of data matrix to be sliced @@ -808,6 +793,16 @@ XGB_DLL int XGDMatrixNumCol(DMatrixHandle handle, bst_ulong *out); */ XGB_DLL int XGDMatrixNumNonMissing(DMatrixHandle handle, bst_ulong *out); +/*! + * \brief Get the data split mode from DMatrix. 
+ * + * \param handle the handle to the DMatrix + * \param out The output of the data split mode + * + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGDMatrixDataSplitMode(DMatrixHandle handle, bst_ulong *out); + /** * \brief Get the predictors from DMatrix as CSR matrix for testing. If this is a * quantized DMatrix, quantized values are returned instead. @@ -1276,15 +1271,6 @@ XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle, XGB_DLL int XGBoosterSaveModelToBuffer(BoosterHandle handle, char const *config, bst_ulong *out_len, char const **out_dptr); -/*! - * \brief Save booster to a buffer with in binary format. - * - * \deprecated since 1.6.0 - * \see XGBoosterSaveModelToBuffer() - */ -XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle, bst_ulong *out_len, - const char **out_dptr); - /*! * \brief Memory snapshot based serialization method. Saves everything states * into buffer. @@ -1308,24 +1294,6 @@ XGB_DLL int XGBoosterSerializeToBuffer(BoosterHandle handle, bst_ulong *out_len, XGB_DLL int XGBoosterUnserializeFromBuffer(BoosterHandle handle, const void *buf, bst_ulong len); -/*! - * \brief Initialize the booster from rabit checkpoint. - * This is used in distributed training API. - * \param handle handle - * \param version The output version of the model. - * \return 0 when success, -1 when failure happens - */ -XGB_DLL int XGBoosterLoadRabitCheckpoint(BoosterHandle handle, - int* version); - -/*! - * \brief Save the current checkpoint to rabit. - * \param handle handle - * \return 0 when success, -1 when failure happens - */ -XGB_DLL int XGBoosterSaveRabitCheckpoint(BoosterHandle handle); - - /*! * \brief Save XGBoost's internal configuration into a JSON document. Currently the * support is experimental, function signature may change in the future without @@ -1554,29 +1522,19 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, const char *config, * \param config JSON encoded configuration. Accepted JSON keys are: * - xgboost_communicator: The type of the communicator. Can be set as an environment variable. * * rabit: Use Rabit. This is the default if the type is unspecified. - * * mpi: Use MPI. * * federated: Use the gRPC interface for Federated Learning. * Only applicable to the Rabit communicator (these are case-sensitive): * - rabit_tracker_uri: Hostname of the tracker. * - rabit_tracker_port: Port number of the tracker. * - rabit_task_id: ID of the current task, can be used to obtain deterministic rank assignment. * - rabit_world_size: Total number of workers. - * - rabit_hadoop_mode: Enable Hadoop support. - * - rabit_tree_reduce_minsize: Minimal size for tree reduce. - * - rabit_reduce_ring_mincount: Minimal count to perform ring reduce. - * - rabit_reduce_buffer: Size of the reduce buffer. - * - rabit_bootstrap_cache: Size of the bootstrap cache. - * - rabit_debug: Enable debugging. * - rabit_timeout: Enable timeout. * - rabit_timeout_sec: Timeout in seconds. - * - rabit_enable_tcp_no_delay: Enable TCP no delay on Unix platforms. * Only applicable to the Rabit communicator (these are case-sensitive, and can be set as * environment variables): * - DMLC_TRACKER_URI: Hostname of the tracker. * - DMLC_TRACKER_PORT: Port number of the tracker. * - DMLC_TASK_ID: ID of the current task, can be used to obtain deterministic rank assignment. - * - DMLC_ROLE: Role of the current task, "worker" or "server". - * - DMLC_NUM_ATTEMPT: Number of attempts after task failure. * - DMLC_WORKER_CONNECT_RETRY: Number of retries to connect to the tracker. 
* Only applicable to the Federated communicator (use upper case for environment variables, use * lower case for runtime configuration): diff --git a/include/xgboost/collective/result.h b/include/xgboost/collective/result.h index 209362505..507171dd4 100644 --- a/include/xgboost/collective/result.h +++ b/include/xgboost/collective/result.h @@ -157,4 +157,13 @@ struct Result { [[nodiscard]] inline auto Fail(std::string msg, std::error_code errc, Result&& prev) { return Result{std::move(msg), std::move(errc), std::forward<Result>(prev)}; } + +// We don't have monad, a simple helper would do. +template <typename Fn> +Result operator<<(Result&& r, Fn&& fn) { + if (!r.OK()) { + return std::forward<Result>(r); + } + return fn(); +} } // namespace xgboost::collective diff --git a/include/xgboost/collective/socket.h b/include/xgboost/collective/socket.h index 5bff2204e..5dd1b9ffa 100644 --- a/include/xgboost/collective/socket.h +++ b/include/xgboost/collective/socket.h @@ -215,9 +215,9 @@ class SockAddrV4 { static SockAddrV4 Loopback(); static SockAddrV4 InaddrAny(); - in_port_t Port() const { return ntohs(addr_.sin_port); } + [[nodiscard]] in_port_t Port() const { return ntohs(addr_.sin_port); } - std::string Addr() const { + [[nodiscard]] std::string Addr() const { char buf[INET_ADDRSTRLEN]; auto const *s = system::inet_ntop(static_cast<std::int32_t>(SockDomain::kV4), &addr_.sin_addr, buf, INET_ADDRSTRLEN); @@ -226,7 +226,7 @@ class SockAddrV4 { } return {buf}; } - sockaddr_in const &Handle() const { return addr_; } + [[nodiscard]] sockaddr_in const &Handle() const { return addr_; } }; /** @@ -243,13 +243,13 @@ class SockAddress { explicit SockAddress(SockAddrV6 const &addr) : v6_{addr}, domain_{SockDomain::kV6} {} explicit SockAddress(SockAddrV4 const &addr) : v4_{addr} {} - auto Domain() const { return domain_; } + [[nodiscard]] auto Domain() const { return domain_; } - bool IsV4() const { return Domain() == SockDomain::kV4; } - bool IsV6() const { return !IsV4(); } + [[nodiscard]] bool IsV4() const { return Domain() == SockDomain::kV4; } + [[nodiscard]] bool IsV6() const { return !IsV4(); } - auto const &V4() const { return v4_; } - auto const &V6() const { return v6_; } + [[nodiscard]] auto const &V4() const { return v4_; } + [[nodiscard]] auto const &V6() const { return v6_; } }; /** @@ -261,6 +261,7 @@ class TCPSocket { private: HandleT handle_{InvalidSocket()}; + bool non_blocking_{false}; // There's reliable no way to extract domain from a socket without first binding that // socket on macos. #if defined(__APPLE__) @@ -276,7 +277,7 @@ class TCPSocket { /** * \brief Return the socket domain.
*/ - auto Domain() const -> SockDomain { + [[nodiscard]] auto Domain() const -> SockDomain { auto ret_iafamily = [](std::int32_t domain) { switch (domain) { case AF_INET: @@ -321,10 +322,10 @@ class TCPSocket { #endif // platforms } - bool IsClosed() const { return handle_ == InvalidSocket(); } + [[nodiscard]] bool IsClosed() const { return handle_ == InvalidSocket(); } - /** \brief get last error code if any */ - Result GetSockError() const { + /** @brief get last error code if any */ + [[nodiscard]] Result GetSockError() const { std::int32_t optval = 0; socklen_t len = sizeof(optval); auto ret = getsockopt(handle_, SOL_SOCKET, SO_ERROR, reinterpret_cast<char *>(&optval), &len); @@ -340,7 +341,7 @@ class TCPSocket { } /** \brief check if anything bad happens */ - bool BadSocket() const { + [[nodiscard]] bool BadSocket() const { if (IsClosed()) { return true; } @@ -352,24 +353,63 @@ class TCPSocket { return false; } - void SetNonBlock(bool non_block) { + [[nodiscard]] Result NonBlocking(bool non_block) { #if defined(_WIN32) u_long mode = non_block ? 1 : 0; - xgboost_CHECK_SYS_CALL(ioctlsocket(handle_, FIONBIO, &mode), NO_ERROR); + if (ioctlsocket(handle_, FIONBIO, &mode) != NO_ERROR) { + return system::FailWithCode("Failed to set socket to non-blocking."); + } #else std::int32_t flag = fcntl(handle_, F_GETFL, 0); - if (flag == -1) { - system::ThrowAtError("fcntl"); + auto rc = flag; + if (rc == -1) { + return system::FailWithCode("Failed to get socket flag."); } if (non_block) { flag |= O_NONBLOCK; } else { flag &= ~O_NONBLOCK; } - if (fcntl(handle_, F_SETFL, flag) == -1) { - system::ThrowAtError("fcntl"); + rc = fcntl(handle_, F_SETFL, flag); + if (rc == -1) { + return system::FailWithCode("Failed to set socket to non-blocking."); } #endif // _WIN32 + non_blocking_ = non_block; + return Success(); + } + [[nodiscard]] bool NonBlocking() const { return non_blocking_; } + [[nodiscard]] Result RecvTimeout(std::chrono::seconds timeout) { + // https://stackoverflow.com/questions/2876024/linux-is-there-a-read-or-recv-from-socket-with-timeout +#if defined(_WIN32) + DWORD tv = timeout.count() * 1000; + auto rc = + setsockopt(Handle(), SOL_SOCKET, SO_RCVTIMEO, reinterpret_cast<char *>(&tv), sizeof(tv)); +#else + struct timeval tv; + tv.tv_sec = timeout.count(); + tv.tv_usec = 0; + auto rc = setsockopt(Handle(), SOL_SOCKET, SO_RCVTIMEO, reinterpret_cast<char const *>(&tv), + sizeof(tv)); +#endif + if (rc != 0) { + return system::FailWithCode("Failed to set timeout on recv."); + } + return Success(); + } + + [[nodiscard]] Result SetBufSize(std::int32_t n_bytes) { + auto rc = setsockopt(this->Handle(), SOL_SOCKET, SO_SNDBUF, reinterpret_cast<char const *>(&n_bytes), + sizeof(n_bytes)); + if (rc != 0) { + return system::FailWithCode("Failed to set send buffer size."); + } + rc = setsockopt(this->Handle(), SOL_SOCKET, SO_RCVBUF, reinterpret_cast<char const *>(&n_bytes), + sizeof(n_bytes)); + if (rc != 0) { + return system::FailWithCode("Failed to set recv buffer size."); + } + return Success(); } void SetKeepAlive() { @@ -391,14 +431,31 @@ class TCPSocket { * \brief Accept new connection, returns a new TCP socket for the new connection.
*/ TCPSocket Accept() { - HandleT newfd = accept(handle_, nullptr, nullptr); - if (newfd == InvalidSocket()) { + HandleT newfd = accept(Handle(), nullptr, nullptr); +#if defined(_WIN32) + auto interrupt = WSAEINTR; +#else + auto interrupt = EINTR; +#endif + if (newfd == InvalidSocket() && system::LastError() != interrupt) { system::ThrowAtError("accept"); } TCPSocket newsock{newfd}; return newsock; } + [[nodiscard]] Result Accept(TCPSocket *out, SockAddrV4 *addr) { + struct sockaddr_in caddr; + socklen_t caddr_len = sizeof(caddr); + HandleT newfd = accept(Handle(), reinterpret_cast(&caddr), &caddr_len); + if (newfd == InvalidSocket()) { + return system::FailWithCode("Failed to accept."); + } + *addr = SockAddrV4{caddr}; + *out = TCPSocket{newfd}; + return Success(); + } + ~TCPSocket() { if (!IsClosed()) { Close(); @@ -413,9 +470,9 @@ class TCPSocket { return *this; } /** - * \brief Return the native socket file descriptor. + * @brief Return the native socket file descriptor. */ - HandleT const &Handle() const { return handle_; } + [[nodiscard]] HandleT const &Handle() const { return handle_; } /** * \brief Listen to incoming requests. Should be called after bind. */ @@ -423,7 +480,7 @@ class TCPSocket { /** * \brief Bind socket to INADDR_ANY, return the port selected by the OS. */ - in_port_t BindHost() { + [[nodiscard]] in_port_t BindHost() { if (Domain() == SockDomain::kV6) { auto addr = SockAddrV6::InaddrAny(); auto handle = reinterpret_cast(&addr.Handle()); @@ -448,10 +505,53 @@ class TCPSocket { return ntohs(res_addr.sin_port); } } + + [[nodiscard]] auto Port() const { + if (this->Domain() == SockDomain::kV4) { + sockaddr_in res_addr; + socklen_t addrlen = sizeof(res_addr); + auto code = getsockname(handle_, reinterpret_cast(&res_addr), &addrlen); + if (code != 0) { + return std::make_pair(system::FailWithCode("getsockname"), std::int32_t{0}); + } + return std::make_pair(Success(), std::int32_t{ntohs(res_addr.sin_port)}); + } else { + sockaddr_in6 res_addr; + socklen_t addrlen = sizeof(res_addr); + auto code = getsockname(handle_, reinterpret_cast(&res_addr), &addrlen); + if (code != 0) { + return std::make_pair(system::FailWithCode("getsockname"), std::int32_t{0}); + } + return std::make_pair(Success(), std::int32_t{ntohs(res_addr.sin6_port)}); + } + } + + [[nodiscard]] Result Bind(StringView ip, std::int32_t *port) { + // bind socket handle_ to ip + auto addr = MakeSockAddress(ip, 0); + std::int32_t errc{0}; + if (addr.IsV4()) { + auto handle = reinterpret_cast(&addr.V4().Handle()); + errc = bind(handle_, handle, sizeof(std::remove_reference_t)); + } else { + auto handle = reinterpret_cast(&addr.V6().Handle()); + errc = bind(handle_, handle, sizeof(std::remove_reference_t)); + } + if (errc != 0) { + return system::FailWithCode("Failed to bind socket."); + } + auto [rc, new_port] = this->Port(); + if (!rc.OK()) { + return std::move(rc); + } + *port = new_port; + return Success(); + } + /** * \brief Send data, without error then all data should be sent. */ - auto SendAll(void const *buf, std::size_t len) { + [[nodiscard]] auto SendAll(void const *buf, std::size_t len) { char const *_buf = reinterpret_cast(buf); std::size_t ndone = 0; while (ndone < len) { @@ -470,7 +570,7 @@ class TCPSocket { /** * \brief Receive data, without error then all data should be received. 
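The new `Bind`/`Port` pair reports failure through `Result` instead of throwing. A minimal sketch of a listener setup under that API; `MakeListener` is a hypothetical helper, not part of this patch:

```cpp
#include <cstdint>

#include "xgboost/collective/result.h"
#include "xgboost/collective/socket.h"

namespace xgboost::collective {
[[nodiscard]] Result MakeListener(TCPSocket *server, std::int32_t *out_port) {
  // Bind() binds with port 0 internally, so the OS picks a free ephemeral
  // port and writes it back through the out parameter.
  auto rc = server->Bind("127.0.0.1", out_port);
  if (!rc.OK()) {
    return rc;
  }
  // For an already-bound socket, Port() exposes the same information as a
  // (Result, port) pair without throwing.
  return server->Port().first;
}
}  // namespace xgboost::collective
```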
*/ - auto RecvAll(void *buf, std::size_t len) { + [[nodiscard]] auto RecvAll(void *buf, std::size_t len) { char *_buf = reinterpret_cast(buf); std::size_t ndone = 0; while (ndone < len) { @@ -524,7 +624,15 @@ class TCPSocket { */ void Close() { if (InvalidSocket() != handle_) { +#if defined(_WIN32) + auto rc = system::CloseSocket(handle_); + // it's possible that we close TCP sockets after finalizing WSA due to detached thread. + if (rc != 0 && system::LastError() != WSANOTINITIALISED) { + system::ThrowAtError("close", rc); + } +#else xgboost_CHECK_SYS_CALL(system::CloseSocket(handle_), 0); +#endif handle_ = InvalidSocket(); } } @@ -546,6 +654,24 @@ class TCPSocket { socket.domain_ = domain; #endif // defined(__APPLE__) return socket; +#endif // defined(xgboost_IS_MINGW) + } + + static TCPSocket *CreatePtr(SockDomain domain) { +#if defined(xgboost_IS_MINGW) + MingWError(); + return nullptr; +#else + auto fd = socket(static_cast(domain), SOCK_STREAM, 0); + if (fd == InvalidSocket()) { + system::ThrowAtError("socket"); + } + auto socket = new TCPSocket{fd}; + +#if defined(__APPLE__) + socket->domain_ = domain; +#endif // defined(__APPLE__) + return socket; #endif // defined(xgboost_IS_MINGW) } }; @@ -567,12 +693,36 @@ class TCPSocket { xgboost::collective::TCPSocket *out_conn); /** - * \brief Get the local host name. + * @brief Get the local host name. */ -inline std::string GetHostName() { - char buf[HOST_NAME_MAX]; - xgboost_CHECK_SYS_CALL(gethostname(&buf[0], HOST_NAME_MAX), 0); - return buf; +[[nodiscard]] Result GetHostName(std::string *p_out); + +/** + * @brief inet_ntop + */ +template +Result INetNToP(H const &host, std::string *p_out) { + std::string &ip = *p_out; + switch (host->h_addrtype) { + case AF_INET: { + auto addr = reinterpret_cast(host->h_addr_list[0]); + char str[INET_ADDRSTRLEN]; + inet_ntop(AF_INET, addr, str, INET_ADDRSTRLEN); + ip = str; + break; + } + case AF_INET6: { + auto addr = reinterpret_cast(host->h_addr_list[0]); + char str[INET6_ADDRSTRLEN]; + inet_ntop(AF_INET6, addr, str, INET6_ADDRSTRLEN); + ip = str; + break; + } + default: { + return Fail("Invalid address type."); + } + } + return Success(); } } // namespace collective } // namespace xgboost diff --git a/include/xgboost/context.h b/include/xgboost/context.h index 262733b22..7748db9f9 100644 --- a/include/xgboost/context.h +++ b/include/xgboost/context.h @@ -29,31 +29,37 @@ struct DeviceSym { * viewing types like `linalg::TensorView`. */ struct DeviceOrd { + // Constant representing the device ID of CPU. + static bst_d_ordinal_t constexpr CPUOrdinal() { return -1; } + static bst_d_ordinal_t constexpr InvalidOrdinal() { return -2; } + enum Type : std::int16_t { kCPU = 0, kCUDA = 1 } device{kCPU}; // CUDA device ordinal. 
- bst_d_ordinal_t ordinal{-1}; + bst_d_ordinal_t ordinal{CPUOrdinal()}; [[nodiscard]] bool IsCUDA() const { return device == kCUDA; } [[nodiscard]] bool IsCPU() const { return device == kCPU; } - DeviceOrd() = default; + constexpr DeviceOrd() = default; constexpr DeviceOrd(Type type, bst_d_ordinal_t ord) : device{type}, ordinal{ord} {} - DeviceOrd(DeviceOrd const& that) = default; - DeviceOrd& operator=(DeviceOrd const& that) = default; - DeviceOrd(DeviceOrd&& that) = default; - DeviceOrd& operator=(DeviceOrd&& that) = default; + constexpr DeviceOrd(DeviceOrd const& that) = default; + constexpr DeviceOrd& operator=(DeviceOrd const& that) = default; + constexpr DeviceOrd(DeviceOrd&& that) = default; + constexpr DeviceOrd& operator=(DeviceOrd&& that) = default; /** * @brief Constructor for CPU. */ - [[nodiscard]] constexpr static auto CPU() { return DeviceOrd{kCPU, -1}; } + [[nodiscard]] constexpr static auto CPU() { return DeviceOrd{kCPU, CPUOrdinal()}; } /** * @brief Constructor for CUDA device. * * @param ordinal CUDA device ordinal. */ - [[nodiscard]] static auto CUDA(bst_d_ordinal_t ordinal) { return DeviceOrd{kCUDA, ordinal}; } + [[nodiscard]] static constexpr auto CUDA(bst_d_ordinal_t ordinal) { + return DeviceOrd{kCUDA, ordinal}; + } [[nodiscard]] bool operator==(DeviceOrd const& that) const { return device == that.device && ordinal == that.ordinal; @@ -78,25 +84,26 @@ struct DeviceOrd { static_assert(sizeof(DeviceOrd) == sizeof(std::int32_t)); +std::ostream& operator<<(std::ostream& os, DeviceOrd ord); + /** * @brief Runtime context for XGBoost. Contains information like threads and device. */ struct Context : public XGBoostParameter { private: + // User interfacing parameter for device ordinal std::string device{DeviceSym::CPU()}; // NOLINT - // The device object for the current context. We are in the middle of replacing the - // `gpu_id` with this device field. + // The device ordinal set by user DeviceOrd device_{DeviceOrd::CPU()}; public: - // Constant representing the device ID of CPU. - static bst_d_ordinal_t constexpr kCpuId = -1; - static bst_d_ordinal_t constexpr InvalidOrdinal() { return -2; } static std::int64_t constexpr kDefaultSeed = 0; public: Context(); + void Init(Args const& kwargs); + template Args UpdateAllowUnknown(Container const& kwargs) { auto args = XGBoostParameter::UpdateAllowUnknown(kwargs); @@ -104,7 +111,6 @@ struct Context : public XGBoostParameter { return args; } - std::int32_t gpu_id{kCpuId}; // The number of threads to use if OpenMP is enabled. If equals 0, use the system default. std::int32_t nthread{0}; // NOLINT // stored random seed @@ -116,7 +122,8 @@ struct Context : public XGBoostParameter { bool validate_parameters{false}; /** - * @brief Configure the parameter `gpu_id'. + * @brief Configure the parameter `device'. Deprecated, will remove once `gpu_id` is + * removed. * * @param require_gpu Whether GPU is explicitly required by the user through other * configurations. @@ -212,9 +219,7 @@ struct Context : public XGBoostParameter { private: void SetDeviceOrdinal(Args const& kwargs); Context& SetDevice(DeviceOrd d) { - this->device_ = d; - this->gpu_id = d.ordinal; // this can be removed once we move away from `gpu_id`. 
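A small compile-time sketch of the `DeviceOrd` changes above: the magic `-1`/`-2` ordinals become named constants, and both factory functions are now `constexpr`, so device descriptors can be built and checked at compile time:

```cpp
#include "xgboost/context.h"

namespace {
constexpr auto kCuda0 = xgboost::DeviceOrd::CUDA(0);
// The device type and ordinal are plain members, usable in constant expressions.
static_assert(kCuda0.device == xgboost::DeviceOrd::kCUDA && kCuda0.ordinal == 0);
// CPU ordinal is a named constant rather than a bare -1.
static_assert(xgboost::DeviceOrd::CPU().ordinal == xgboost::DeviceOrd::CPUOrdinal());
}  // namespace
```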
- this->device = d.Name(); + this->device = (this->device_ = d).Name(); return *this; } diff --git a/include/xgboost/data.h b/include/xgboost/data.h index eae2f612b..04b489d8b 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -106,10 +106,10 @@ class MetaInfo { MetaInfo& operator=(MetaInfo&& that) = default; MetaInfo& operator=(MetaInfo const& that) = delete; - /*! - * \brief Validate all metainfo. + /** + * @brief Validate all metainfo. */ - void Validate(int32_t device) const; + void Validate(DeviceOrd device) const; MetaInfo Slice(common::Span ridxs) const; @@ -559,8 +559,7 @@ class DMatrix { * * \param uri The URI of input. * \param silent Whether print information during loading. - * \param data_split_mode In distributed mode, split the input according this mode; otherwise, - * it's just an indicator on how the input was split beforehand. + * \param data_split_mode Indicate how the data was split beforehand. * \return The created DMatrix. */ static DMatrix* Load(const std::string& uri, bool silent = true, diff --git a/include/xgboost/host_device_vector.h b/include/xgboost/host_device_vector.h index 13ae5bdf5..eb4b004dd 100644 --- a/include/xgboost/host_device_vector.h +++ b/include/xgboost/host_device_vector.h @@ -88,9 +88,9 @@ class HostDeviceVector { static_assert(std::is_standard_layout::value, "HostDeviceVector admits only POD types"); public: - explicit HostDeviceVector(size_t size = 0, T v = T(), int device = -1); - HostDeviceVector(std::initializer_list init, int device = -1); - explicit HostDeviceVector(const std::vector& init, int device = -1); + explicit HostDeviceVector(size_t size = 0, T v = T(), DeviceOrd device = DeviceOrd::CPU()); + HostDeviceVector(std::initializer_list init, DeviceOrd device = DeviceOrd::CPU()); + explicit HostDeviceVector(const std::vector& init, DeviceOrd device = DeviceOrd::CPU()); ~HostDeviceVector(); HostDeviceVector(const HostDeviceVector&) = delete; @@ -99,17 +99,9 @@ class HostDeviceVector { HostDeviceVector& operator=(const HostDeviceVector&) = delete; HostDeviceVector& operator=(HostDeviceVector&&); - bool Empty() const { return Size() == 0; } - size_t Size() const; - int DeviceIdx() const; - DeviceOrd Device() const { - auto idx = this->DeviceIdx(); - if (idx == DeviceOrd::CPU().ordinal) { - return DeviceOrd::CPU(); - } else { - return DeviceOrd::CUDA(idx); - } - } + [[nodiscard]] bool Empty() const { return Size() == 0; } + [[nodiscard]] std::size_t Size() const; + [[nodiscard]] DeviceOrd Device() const; common::Span DeviceSpan(); common::Span ConstDeviceSpan() const; common::Span DeviceSpan() const { return ConstDeviceSpan(); } @@ -135,13 +127,12 @@ class HostDeviceVector { const std::vector& ConstHostVector() const; const std::vector& HostVector() const {return ConstHostVector(); } - bool HostCanRead() const; - bool HostCanWrite() const; - bool DeviceCanRead() const; - bool DeviceCanWrite() const; - GPUAccess DeviceAccess() const; + [[nodiscard]] bool HostCanRead() const; + [[nodiscard]] bool HostCanWrite() const; + [[nodiscard]] bool DeviceCanRead() const; + [[nodiscard]] bool DeviceCanWrite() const; + [[nodiscard]] GPUAccess DeviceAccess() const; - void SetDevice(int device) const; void SetDevice(DeviceOrd device) const; void Resize(size_t new_size, T v = T()); diff --git a/include/xgboost/json.h b/include/xgboost/json.h index cb22e120e..c2c16ef8f 100644 --- a/include/xgboost/json.h +++ b/include/xgboost/json.h @@ -372,6 +372,19 @@ class Json { /*! \brief Use your own JsonWriter. 
*/ static void Dump(Json json, JsonWriter* writer); + template + static Container Dump(Json json) { + if constexpr (std::is_same_v) { + std::string str; + Dump(json, &str); + return str; + } else { + std::vector str; + Dump(json, &str); + return str; + } + } + Json() = default; // number @@ -595,44 +608,6 @@ using Boolean = JsonBoolean; using String = JsonString; using Null = JsonNull; -// Utils tailored for XGBoost. -namespace detail { -template -bool TypeCheckImpl(Json const& value) { - return IsA(value); -} - -template -std::enable_if_t TypeCheckImpl(Json const& value) { - return IsA(value) || TypeCheckImpl(value); -} - -template -std::string TypeCheckError() { - return "`" + Head{}.TypeStr() + "`"; -} - -template -std::enable_if_t TypeCheckError() { - return "`" + Head{}.TypeStr() + "`, " + TypeCheckError(); -} -} // namespace detail - -/** - * \brief Type check for JSON-based parameters - * - * \tparam JT Expected JSON types. - * \param value Value to be checked. - */ -template -void TypeCheck(Json const& value, StringView name) { - if (!detail::TypeCheckImpl(value)) { - LOG(FATAL) << "Invalid type for: `" << name << "`, expecting one of the: {`" - << detail::TypeCheckError() << "}, got: `" << value.GetValue().TypeStr() - << "`"; - } -} - /** * \brief Convert XGBoost parameter to JSON object. * diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index c183e2eea..09ad0d847 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -603,13 +603,13 @@ auto MakeTensorView(Context const *ctx, Order order, common::Span data, S &&. template auto MakeTensorView(Context const *ctx, HostDeviceVector *data, S &&...shape) { - auto span = ctx->IsCPU() ? data->HostSpan() : data->DeviceSpan(); + auto span = ctx->IsCUDA() ? data->DeviceSpan() : data->HostSpan(); return MakeTensorView(ctx->Device(), span, std::forward(shape)...); } template auto MakeTensorView(Context const *ctx, HostDeviceVector const *data, S &&...shape) { - auto span = ctx->IsCPU() ? data->ConstHostSpan() : data->ConstDeviceSpan(); + auto span = ctx->IsCUDA() ? data->ConstDeviceSpan() : data->ConstHostSpan(); return MakeTensorView(ctx->Device(), span, std::forward(shape)...); } @@ -659,13 +659,13 @@ auto MakeVec(T *ptr, size_t s, DeviceOrd device = DeviceOrd::CPU()) { template auto MakeVec(HostDeviceVector *data) { - return MakeVec(data->DeviceIdx() == -1 ? data->HostPointer() : data->DevicePointer(), - data->Size(), data->Device()); + return MakeVec(data->Device().IsCPU() ? data->HostPointer() : data->DevicePointer(), data->Size(), + data->Device()); } template auto MakeVec(HostDeviceVector const *data) { - return MakeVec(data->DeviceIdx() == -1 ? data->ConstHostPointer() : data->ConstDevicePointer(), + return MakeVec(data->Device().IsCPU() ? data->ConstHostPointer() : data->ConstDevicePointer(), data->Size(), data->Device()); } @@ -757,13 +757,13 @@ class Tensor { Order order_{Order::kC}; template - void Initialize(I const (&shape)[D], std::int32_t device) { + void Initialize(I const (&shape)[D], DeviceOrd device) { static_assert(D <= kDim, "Invalid shape."); std::copy(shape, shape + D, shape_); for (auto i = D; i < kDim; ++i) { shape_[i] = 1; } - if (device >= 0) { + if (device.IsCUDA()) { data_.SetDevice(device); data_.ConstDevicePointer(); // Pull to device; } @@ -780,14 +780,11 @@ class Tensor { * See \ref TensorView for parameters of this constructor. 
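The templated `Json::Dump` added above selects the output container at compile time via `if constexpr`. A short usage sketch, assuming only the API shown in this hunk:

```cpp
#include <string>
#include <vector>

#include "xgboost/json.h"

void DumpExample() {
  xgboost::Json config{xgboost::Object{}};
  config["name"] = xgboost::String{"example"};

  // Equivalent to declaring the container first and calling Dump(json, &out).
  auto text = xgboost::Json::Dump<std::string>(config);
  auto bytes = xgboost::Json::Dump<std::vector<char>>(config);
}
```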
*/ template - explicit Tensor(I const (&shape)[D], std::int32_t device, Order order = kC) - : Tensor{common::Span{shape}, device, order} {} - template explicit Tensor(I const (&shape)[D], DeviceOrd device, Order order = kC) - : Tensor{common::Span{shape}, device.ordinal, order} {} + : Tensor{common::Span{shape}, device, order} {} template - explicit Tensor(common::Span shape, std::int32_t device, Order order = kC) + explicit Tensor(common::Span shape, DeviceOrd device, Order order = kC) : order_{order} { // No device unroll as this is a host only function. std::copy(shape.data(), shape.data() + D, shape_); @@ -795,11 +792,11 @@ class Tensor { shape_[i] = 1; } auto size = detail::CalcSize(shape_); - if (device >= 0) { + if (device.IsCUDA()) { data_.SetDevice(device); } data_.Resize(size); - if (device >= 0) { + if (device.IsCUDA()) { data_.DevicePointer(); // Pull to device } } @@ -807,7 +804,7 @@ class Tensor { * Initialize from 2 host iterators. */ template - explicit Tensor(It begin, It end, I const (&shape)[D], std::int32_t device, Order order = kC) + explicit Tensor(It begin, It end, I const (&shape)[D], DeviceOrd device, Order order = kC) : order_{order} { auto &h_vec = data_.HostVector(); h_vec.insert(h_vec.begin(), begin, end); @@ -816,7 +813,7 @@ class Tensor { } template - explicit Tensor(std::initializer_list data, I const (&shape)[D], std::int32_t device, + explicit Tensor(std::initializer_list data, I const (&shape)[D], DeviceOrd device, Order order = kC) : order_{order} { auto &h_vec = data_.HostVector(); @@ -824,10 +821,6 @@ class Tensor { // shape this->Initialize(shape, device); } - template - explicit Tensor(std::initializer_list data, I const (&shape)[D], DeviceOrd device, - Order order = kC) - : Tensor{data, shape, device.ordinal, order} {} /** * \brief Index operator. Not thread safe, should not be used in performance critical * region. For more efficient indexing, consider getting a view first. @@ -944,9 +937,7 @@ class Tensor { /** * \brief Set device ordinal for this tensor. */ - void SetDevice(int32_t device) const { data_.SetDevice(device); } void SetDevice(DeviceOrd device) const { data_.SetDevice(device); } - [[nodiscard]] int32_t DeviceIdx() const { return data_.DeviceIdx(); } [[nodiscard]] DeviceOrd Device() const { return data_.Device(); } }; @@ -962,7 +953,7 @@ using Vector = Tensor; template auto Empty(Context const *ctx, Index &&...index) { Tensor t; - t.SetDevice(ctx->gpu_id); + t.SetDevice(ctx->Device()); t.Reshape(index...); return t; } @@ -973,7 +964,7 @@ auto Empty(Context const *ctx, Index &&...index) { template auto Constant(Context const *ctx, T v, Index &&...index) { Tensor t; - t.SetDevice(ctx->gpu_id); + t.SetDevice(ctx->Device()); t.Reshape(index...); t.Data()->Fill(std::move(v)); return t; @@ -990,8 +981,8 @@ auto Zeros(Context const *ctx, Index &&...index) { // Only first axis is supported for now. 
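After this migration the linalg factory helpers take their placement from the `Context` rather than a raw ordinal, so the `device >= 0` checks become `device.IsCUDA()`. A minimal sketch with a default (CPU) context:

```cpp
#include "xgboost/context.h"
#include "xgboost/linalg.h"

void TensorExample() {
  xgboost::Context ctx;  // defaults to CPU
  // A 3x4 tensor allocated on the context's device; SetDevice and the
  // Tensor constructors now accept DeviceOrd instead of std::int32_t.
  auto t = xgboost::linalg::Empty<float>(&ctx, 3, 4);
  auto ones = xgboost::linalg::Constant<float>(&ctx, 1.0f, 3, 4);
}
```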
template void Stack(Tensor *l, Tensor const &r) { - if (r.DeviceIdx() >= 0) { - l->SetDevice(r.DeviceIdx()); + if (r.Device().IsCUDA()) { + l->SetDevice(r.Device()); } l->ModifyInplace([&](HostDeviceVector *data, common::Span shape) { for (size_t i = 1; i < D; ++i) { diff --git a/include/xgboost/predictor.h b/include/xgboost/predictor.h index 2c69cf648..25571213d 100644 --- a/include/xgboost/predictor.h +++ b/include/xgboost/predictor.h @@ -52,9 +52,9 @@ class PredictionContainer : public DMatrixCache { public: PredictionContainer() : DMatrixCache{DefaultSize()} {} - PredictionCacheEntry& Cache(std::shared_ptr m, std::int32_t device) { + PredictionCacheEntry& Cache(std::shared_ptr m, DeviceOrd device) { auto p_cache = this->CacheItem(m); - if (device != Context::kCpuId) { + if (device.IsCUDA()) { p_cache->predictions.SetDevice(device); } return *p_cache; diff --git a/include/xgboost/string_view.h b/include/xgboost/string_view.h index 8b5bff7f6..ba0d9f368 100644 --- a/include/xgboost/string_view.h +++ b/include/xgboost/string_view.h @@ -29,7 +29,7 @@ struct StringView { public: constexpr StringView() = default; constexpr StringView(CharT const* str, std::size_t size) : str_{str}, size_{size} {} - explicit StringView(std::string const& str) : str_{str.c_str()}, size_{str.size()} {} + StringView(std::string const& str) : str_{str.c_str()}, size_{str.size()} {} // NOLINT constexpr StringView(CharT const* str) // NOLINT : str_{str}, size_{str == nullptr ? 0ul : Traits::length(str)} {} diff --git a/jvm-packages/CMakeLists.txt b/jvm-packages/CMakeLists.txt index 247c44378..d87301753 100644 --- a/jvm-packages/CMakeLists.txt +++ b/jvm-packages/CMakeLists.txt @@ -4,16 +4,16 @@ list(APPEND JVM_SOURCES ${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j/src/native/xgboost4j.cpp ${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cpp) -if (USE_CUDA) +if(USE_CUDA) list(APPEND JVM_SOURCES ${PROJECT_SOURCE_DIR}/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cu) -endif (USE_CUDA) +endif() add_library(xgboost4j SHARED ${JVM_SOURCES} ${XGBOOST_OBJ_SOURCES}) -if (ENABLE_ALL_WARNINGS) +if(ENABLE_ALL_WARNINGS) target_compile_options(xgboost4j PUBLIC -Wall -Wextra) -endif (ENABLE_ALL_WARNINGS) +endif() target_link_libraries(xgboost4j PRIVATE objxgboost) target_include_directories(xgboost4j diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ExternalCheckpointManagerSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ExternalCheckpointManagerSuite.scala index adc9c1068..e6835158d 100755 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ExternalCheckpointManagerSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ExternalCheckpointManagerSuite.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014-2022 by Contributors + Copyright (c) 2014-2023 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -32,57 +32,53 @@ class ExternalCheckpointManagerSuite extends AnyFunSuite with TmpFolderPerSuite } private def createNewModels(): - (String, XGBoostClassificationModel, XGBoostClassificationModel) = { + (String, XGBoostClassificationModel, XGBoostClassificationModel) = { val tmpPath = createTmpFolder("test").toAbsolutePath.toString - val (model4, model8) = { + val (model2, model4) = { val training = buildDataFrame(Classification.train) val paramMap = produceParamMap(tmpPath, 2) (new XGBoostClassifier(paramMap ++ Seq("num_round" -> 2)).fit(training), new XGBoostClassifier(paramMap ++ Seq("num_round" -> 4)).fit(training)) } - (tmpPath, model4, model8) + (tmpPath, model2, model4) } test("test update/load models") { - val (tmpPath, model4, model8) = createNewModels() + val (tmpPath, model2, model4) = createNewModels() val manager = new ExternalCheckpointManager(tmpPath, FileSystem.get(sc.hadoopConfiguration)) - manager.updateCheckpoint(model4._booster.booster) + manager.updateCheckpoint(model2._booster.booster) var files = FileSystem.get(sc.hadoopConfiguration).listStatus(new Path(tmpPath)) assert(files.length == 1) - assert(files.head.getPath.getName == "4.model") - assert(manager.loadCheckpointAsScalaBooster().getVersion == 4) + assert(files.head.getPath.getName == "1.model") + assert(manager.loadCheckpointAsScalaBooster().getNumBoostedRound == 2) - manager.updateCheckpoint(model8._booster) + manager.updateCheckpoint(model4._booster) files = FileSystem.get(sc.hadoopConfiguration).listStatus(new Path(tmpPath)) assert(files.length == 1) - assert(files.head.getPath.getName == "8.model") - assert(manager.loadCheckpointAsScalaBooster().getVersion == 8) + assert(files.head.getPath.getName == "3.model") + assert(manager.loadCheckpointAsScalaBooster().getNumBoostedRound == 4) } test("test cleanUpHigherVersions") { - val (tmpPath, model4, model8) = createNewModels() + val (tmpPath, model2, model4) = createNewModels() val manager = new ExternalCheckpointManager(tmpPath, FileSystem.get(sc.hadoopConfiguration)) - manager.updateCheckpoint(model8._booster) - manager.cleanUpHigherVersions(8) - assert(new File(s"$tmpPath/8.model").exists()) + manager.updateCheckpoint(model4._booster) + manager.cleanUpHigherVersions(3) + assert(new File(s"$tmpPath/3.model").exists()) - manager.cleanUpHigherVersions(4) - assert(!new File(s"$tmpPath/8.model").exists()) + manager.cleanUpHigherVersions(2) + assert(!new File(s"$tmpPath/3.model").exists()) } test("test checkpoint rounds") { import scala.collection.JavaConverters._ - val (tmpPath, model4, model8) = createNewModels() + val (tmpPath, model2, model4) = createNewModels() val manager = new ExternalCheckpointManager(tmpPath, FileSystem.get(sc.hadoopConfiguration)) - assertResult(Seq(7))( - manager.getCheckpointRounds(0, 7).asScala) - assertResult(Seq(2, 4, 6, 7))( - manager.getCheckpointRounds(2, 7).asScala) - manager.updateCheckpoint(model4._booster) - assertResult(Seq(4, 6, 7))( - manager.getCheckpointRounds(2, 7).asScala) + assertResult(Seq(2))(manager.getCheckpointRounds(0, 0, 3).asScala) + assertResult(Seq(0, 2, 4, 6))(manager.getCheckpointRounds(0, 2, 7).asScala) + assertResult(Seq(0, 2, 4, 6, 7))(manager.getCheckpointRounds(0, 2, 8).asScala) } @@ -109,8 +105,8 @@ class ExternalCheckpointManagerSuite extends AnyFunSuite with TmpFolderPerSuite // Check only one model is kept after training val files = FileSystem.get(sc.hadoopConfiguration).listStatus(new Path(tmpPath)) assert(files.length == 1) - assert(files.head.getPath.getName == "8.model") - val tmpModel = 
SXGBoost.loadModel(s"$tmpPath/8.model") + assert(files.head.getPath.getName == "4.model") + val tmpModel = SXGBoost.loadModel(s"$tmpPath/4.model") // Train next model based on prev model val nextModel = new XGBoostClassifier(paramMap ++ Seq("num_round" -> 8)).fit(training) assert(error(tmpModel) >= error(prevModel._booster)) diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java index 11f5299c0..51959ce0c 100644 --- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java +++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/Booster.java @@ -1,5 +1,5 @@ /* - Copyright (c) 2014-2022 by Contributors + Copyright (c) 2014-2023 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -39,6 +39,21 @@ public class Booster implements Serializable, KryoSerializable { // handle to the booster. private long handle = 0; private int version = 0; + /** + * Type of prediction, used for inplace_predict. + */ + public enum PredictionType { + kValue(0), + kMargin(1); + + private Integer ptype; + private PredictionType(final Integer ptype) { + this.ptype = ptype; + } + public Integer getPType() { + return ptype; + } + } /** * Create a new Booster with empty stage. @@ -375,6 +390,97 @@ public class Booster implements Serializable, KryoSerializable { return predicts; } + /** + * Perform thread-safe prediction. + * + * @param data Flattened input matrix of features for prediction + * @param nrow The number of preditions to make (count of input matrix rows) + * @param ncol The number of features in the model (count of input matrix columns) + * @param missing Value indicating missing element in the data input matrix + * + * @return predict Result matrix + */ + public float[][] inplace_predict(float[] data, + int nrow, + int ncol, + float missing) throws XGBoostError { + int[] iteration_range = new int[2]; + iteration_range[0] = 0; + iteration_range[1] = 0; + return this.inplace_predict(data, nrow, ncol, + missing, iteration_range, PredictionType.kValue, null); + } + + /** + * Perform thread-safe prediction. + * + * @param data Flattened input matrix of features for prediction + * @param nrow The number of preditions to make (count of input matrix rows) + * @param ncol The number of features in the model (count of input matrix columns) + * @param missing Value indicating missing element in the data input matrix + * @param iteration_range Specifies which layer of trees are used in prediction. For + * example, if a random forest is trained with 100 rounds. + * Specifying `iteration_range=[10, 20)`, then only the forests + * built during [10, 20) (half open set) rounds are used in this + * prediction. + * + * @return predict Result matrix + */ + public float[][] inplace_predict(float[] data, + int nrow, + int ncol, + float missing, int[] iteration_range) throws XGBoostError { + return this.inplace_predict(data, nrow, ncol, + missing, iteration_range, PredictionType.kValue, null); + } + + + /** + * Perform thread-safe prediction. 
+ * + * @param data Flattened input matrix of features for prediction + * @param nrow The number of preditions to make (count of input matrix rows) + * @param ncol The number of features in the model (count of input matrix columns) + * @param missing Value indicating missing element in the data input matrix + * @param iteration_range Specifies which layer of trees are used in prediction. For + * example, if a random forest is trained with 100 rounds. + * Specifying `iteration_range=[10, 20)`, then only the forests + * built during [10, 20) (half open set) rounds are used in this + * prediction. + * @param predict_type What kind of prediction to run. + * @return predict Result matrix + */ + public float[][] inplace_predict(float[] data, + int nrow, + int ncol, + float missing, + int[] iteration_range, + PredictionType predict_type, + float[] base_margin) throws XGBoostError { + if (iteration_range.length != 2) { + throw new XGBoostError(new String("Iteration range is expected to be [begin, end).")); + } + int ptype = predict_type.getPType(); + + int begin = iteration_range[0]; + int end = iteration_range[1]; + + float[][] rawPredicts = new float[1][]; + XGBoostJNI.checkCall(XGBoostJNI.XGBoosterPredictFromDense(handle, data, nrow, ncol, + missing, + begin, end, ptype, base_margin, rawPredicts)); + + int col = rawPredicts[0].length / nrow; + float[][] predicts = new float[nrow][col]; + int r, c; + for (int i = 0; i < rawPredicts[0].length; i++) { + r = i / col; + c = i % col; + predicts[r][c] = rawPredicts[0][i]; + } + return predicts; + } + /** * Predict leaf indices given the data * @@ -681,35 +787,6 @@ public class Booster implements Serializable, KryoSerializable { return importanceMap; } - /** - * Save the model as byte array representation. - * Write these bytes to a file will give compatible format with other xgboost bindings. - * - * If java natively support HDFS file API, use toByteArray and write the ByteArray - * - * @param withStats Controls whether the split statistics are output. - * @return dumped model information - * @throws XGBoostError native error - */ - private String[] getDumpInfo(boolean withStats) throws XGBoostError { - int statsFlag = 0; - if (withStats) { - statsFlag = 1; - } - String[][] modelInfos = new String[1][]; - XGBoostJNI.checkCall(XGBoostJNI.XGBoosterDumpModelEx(handle, "", statsFlag, "text", - modelInfos)); - return modelInfos[0]; - } - - public int getVersion() { - return this.version; - } - - public void setVersion(int version) { - this.version = version; - } - /** * Save model into raw byte array. Currently it's using the deprecated format as * default, which will be changed into `ubj` in future releases. @@ -735,29 +812,6 @@ public class Booster implements Serializable, KryoSerializable { return bytes[0]; } - /** - * Load the booster model from thread-local rabit checkpoint. - * This is only used in distributed training. - * @return the stored version number of the checkpoint. - * @throws XGBoostError - */ - int loadRabitCheckpoint() throws XGBoostError { - int[] out = new int[1]; - XGBoostJNI.checkCall(XGBoostJNI.XGBoosterLoadRabitCheckpoint(this.handle, out)); - version = out[0]; - return version; - } - - /** - * Save the booster model into thread-local rabit checkpoint and increment the version. - * This is only used in distributed training. 
- * @throws XGBoostError - */ - void saveRabitCheckpoint() throws XGBoostError { - XGBoostJNI.checkCall(XGBoostJNI.XGBoosterSaveRabitCheckpoint(this.handle)); - version += 1; - } - /** * Get number of model features. * @return the number of features. @@ -768,6 +822,11 @@ public class Booster implements Serializable, KryoSerializable { XGBoostJNI.checkCall(XGBoostJNI.XGBoosterGetNumFeature(this.handle, numFeature)); return numFeature[0]; } + public int getNumBoostedRound() throws XGBoostError { + int[] numRound = new int[1]; + XGBoostJNI.checkCall(XGBoostJNI.XGBoosterGetNumBoostedRound(this.handle, numRound)); + return numRound[0]; + } /** * Internal initialization function. diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/ExternalCheckpointManager.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/ExternalCheckpointManager.java index 655b99020..3d794756d 100644 --- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/ExternalCheckpointManager.java +++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/ExternalCheckpointManager.java @@ -1,3 +1,18 @@ +/* + Copyright (c) 2014-2023 by Contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + */ package ml.dmlc.xgboost4j.java; import java.io.IOException; @@ -15,7 +30,7 @@ public class ExternalCheckpointManager { private Log logger = LogFactory.getLog("ExternalCheckpointManager"); private String modelSuffix = ".model"; - private Path checkpointPath; + private Path checkpointPath; // directory for checkpoints private FileSystem fs; public ExternalCheckpointManager(String checkpointPath, FileSystem fs) throws XGBoostError { @@ -35,6 +50,7 @@ public class ExternalCheckpointManager { if (!fs.exists(checkpointPath)) { return new ArrayList<>(); } else { + // Get integer versions from a list of checkpoint files. 
return Arrays.stream(fs.listStatus(checkpointPath)) .map(path -> path.getPath().getName()) .filter(fileName -> fileName.endsWith(modelSuffix)) @@ -44,6 +60,11 @@ public class ExternalCheckpointManager { } } + private Integer latest(List versions) { + return versions.stream() + .max(Comparator.comparing(Integer::valueOf)).get(); + } + public void cleanPath() throws IOException { fs.delete(checkpointPath, true); } @@ -51,12 +72,11 @@ public class ExternalCheckpointManager { public Booster loadCheckpointAsBooster() throws IOException, XGBoostError { List versions = getExistingVersions(); if (versions.size() > 0) { - int latestVersion = versions.stream().max(Comparator.comparing(Integer::valueOf)).get(); + int latestVersion = this.latest(versions); String checkpointPath = getPath(latestVersion); InputStream in = fs.open(new Path(checkpointPath)); logger.info("loaded checkpoint from " + checkpointPath); Booster booster = XGBoost.loadModel(in); - booster.setVersion(latestVersion); return booster; } else { return null; @@ -65,13 +85,16 @@ public class ExternalCheckpointManager { public void updateCheckpoint(Booster boosterToCheckpoint) throws IOException, XGBoostError { List prevModelPaths = getExistingVersions().stream() - .map(this::getPath).collect(Collectors.toList()); - String eventualPath = getPath(boosterToCheckpoint.getVersion()); + .map(this::getPath).collect(Collectors.toList()); + // checkpointing is done after update, so n_rounds - 1 is the current iteration + // accounting for training continuation. + Integer iter = boosterToCheckpoint.getNumBoostedRound() - 1; + String eventualPath = getPath(iter); String tempPath = eventualPath + "-" + UUID.randomUUID(); try (OutputStream out = fs.create(new Path(tempPath), true)) { boosterToCheckpoint.saveModel(out); fs.rename(new Path(tempPath), new Path(eventualPath)); - logger.info("saving checkpoint with version " + boosterToCheckpoint.getVersion()); + logger.info("saving checkpoint with version " + iter); prevModelPaths.stream().forEach(path -> { try { fs.delete(new Path(path), true); @@ -83,7 +106,7 @@ public class ExternalCheckpointManager { } public void cleanUpHigherVersions(int currentRound) throws IOException { - getExistingVersions().stream().filter(v -> v / 2 >= currentRound).forEach(v -> { + getExistingVersions().stream().filter(v -> v > currentRound).forEach(v -> { try { fs.delete(new Path(getPath(v)), true); } catch (IOException e) { @@ -91,27 +114,26 @@ public class ExternalCheckpointManager { } }); } - - public List getCheckpointRounds(int checkpointInterval, int numOfRounds) + // Get a list of iterations that need checkpointing. 
+  public List<Integer> getCheckpointRounds(
+      int firstRound, int checkpointInterval, int numOfRounds)
       throws IOException {
+    int end = firstRound + numOfRounds; // exclusive
+    int lastRound = end - 1;
+    if (end - 1 < 0) {
+      throw new IllegalArgumentException("Invalid `numOfRounds`.");
+    }
+
+    List<Integer> arr = new ArrayList<>();
     if (checkpointInterval > 0) {
-      List<Integer> prevRounds =
-          getExistingVersions().stream().map(v -> v / 2).collect(Collectors.toList());
-      prevRounds.add(0);
-      int firstCheckpointRound = prevRounds.stream()
-          .max(Comparator.comparing(Integer::valueOf)).get() + checkpointInterval;
-      List<Integer> arr = new ArrayList<>();
-      for (int i = firstCheckpointRound; i <= numOfRounds; i += checkpointInterval) {
+      for (int i = firstRound; i < end; i += checkpointInterval) {
         arr.add(i);
       }
-      arr.add(numOfRounds);
-      return arr;
-    } else if (checkpointInterval <= 0) {
-      List<Integer> l = new ArrayList<Integer>();
-      l.add(numOfRounds);
-      return l;
-    } else {
-      throw new IllegalArgumentException("parameters \"checkpoint_path\" should also be set.");
     }
+
+    if (!arr.contains(lastRound)) {
+      arr.add(lastRound);
+    }
+    return arr;
   }
 }

diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoost.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoost.java
index bcd0b1b11..2be62a343 100644
--- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoost.java
+++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoost.java
@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2014,2021 by Contributors
+ Copyright (c) 2014-2023 by Contributors

  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -133,7 +133,7 @@ public class XGBoost {
       int earlyStoppingRound) throws XGBoostError {
     return train(dtrain, params, round, watches, metrics, obj, eval, earlyStoppingRound, null);
   }
-
+  // save checkpoint if iter is in checkpointIterations
   private static void saveCheckpoint(
       Booster booster,
       int iter,
@@ -169,7 +169,6 @@ public class XGBoost {
     int bestIteration;
     List<String> names = new ArrayList<String>();
     List<DMatrix> mats = new ArrayList<DMatrix>();
-    Set<Integer> checkpointIterations = new HashSet<>();
     ExternalCheckpointManager ecm = null;
     if (checkpointPath != null) {
       ecm = new ExternalCheckpointManager(checkpointPath, fs);
@@ -203,32 +202,30 @@ public class XGBoost {
       booster = new Booster(params, allMats);
       booster.setFeatureNames(dtrain.getFeatureNames());
       booster.setFeatureTypes(dtrain.getFeatureTypes());
-      booster.loadRabitCheckpoint();
     } else {
       // Start training on an existing booster
       booster.setParams(params);
     }
+    Set<Integer> checkpointIterations = new HashSet<>();
     if (ecm != null) {
-      checkpointIterations = new HashSet<>(ecm.getCheckpointRounds(checkpointInterval, numRounds));
+      checkpointIterations = new HashSet<>(
+          ecm.getCheckpointRounds(booster.getNumBoostedRound(), checkpointInterval, numRounds));
     }

     boolean initial_best_score_flag = false;
     boolean max_direction = false;
     // begin to train
-    for (int iter = booster.getVersion() / 2; iter < numRounds; iter++) {
-      if (booster.getVersion() % 2 == 0) {
-        if (obj != null) {
-          booster.update(dtrain, obj);
-        } else {
-          booster.update(dtrain, iter);
-        }
-        saveCheckpoint(booster, iter, checkpointIterations, ecm);
-        booster.saveRabitCheckpoint();
+    for (int iter = 0; iter < numRounds; iter++) {
+      if (obj != null) {
+        booster.update(dtrain, iter, obj);
+      } else {
+        booster.update(dtrain, iter);
       }
+      saveCheckpoint(booster, iter, checkpointIterations, ecm);

-      //evaluation
+      // evaluation
       if (evalMats.length > 0) {
        float[]
metricsOut = new float[evalMats.length]; String evalInfo; @@ -285,7 +282,6 @@ public class XGBoost { Communicator.communicatorPrint(evalInfo + '\n'); } } - booster.saveRabitCheckpoint(); } return booster; } diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java index d71d0a4f5..236d53e90 100644 --- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java +++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/XGBoostJNI.java @@ -119,6 +119,10 @@ class XGBoostJNI { public final static native int XGBoosterPredict(long handle, long dmat, int option_mask, int ntree_limit, float[][] predicts); + public final static native int XGBoosterPredictFromDense(long handle, float[] data, + long nrow, long ncol, float missing, int iteration_begin, int iteration_end, int predict_type, float[] margin, + float[][] predicts); + public final static native int XGBoosterLoadModel(long handle, String fname); public final static native int XGBoosterSaveModel(long handle, String fname); @@ -136,10 +140,11 @@ class XGBoostJNI { public final static native int XGBoosterGetAttrNames(long handle, String[][] out_strings); public final static native int XGBoosterGetAttr(long handle, String key, String[] out_string); public final static native int XGBoosterSetAttr(long handle, String key, String value); - public final static native int XGBoosterLoadRabitCheckpoint(long handle, int[] out_version); - public final static native int XGBoosterSaveRabitCheckpoint(long handle); + public final static native int XGBoosterGetNumFeature(long handle, long[] feature); + public final static native int XGBoosterGetNumBoostedRound(long handle, int[] rounds); + // communicator functions public final static native int CommunicatorInit(String[] args); public final static native int CommunicatorFinalize(); @@ -154,10 +159,6 @@ class XGBoostJNI { public final static native int XGDMatrixSetInfoFromInterface( long handle, String field, String json); - @Deprecated - public final static native int XGDeviceQuantileDMatrixCreateFromCallback( - java.util.Iterator iter, float missing, int nthread, int maxBin, long[] out); - public final static native int XGQuantileDMatrixCreateFromCallback( java.util.Iterator iter, java.util.Iterator ref, String config, long[] out); diff --git a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/Booster.scala b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/Booster.scala index 31be86898..c288bfab1 100644 --- a/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/Booster.scala +++ b/jvm-packages/xgboost4j/src/main/scala/ml/dmlc/xgboost4j/scala/Booster.scala @@ -326,7 +326,7 @@ class Booster private[xgboost4j](private[xgboost4j] var booster: JBooster) @throws(classOf[XGBoostError]) def getNumFeature: Long = booster.getNumFeature - def getVersion: Int = booster.getVersion + def getNumBoostedRound: Long = booster.getNumBoostedRound /** * Save model into a raw byte array. Available options are "json", "ubj" and "deprecated". 
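The new `getCheckpointRounds` replaces the version-counter arithmetic (`v / 2`) with plain boosted-round numbers: a checkpoint lands every `checkpointInterval` rounds starting from `firstRound`, and the last round of the session is always checkpointed. A C++ rendering of that logic, hedged as an illustrative translation of the Java above rather than shipped code:

```cpp
#include <vector>

std::vector<int> CheckpointRounds(int first_round, int interval, int n_rounds) {
  int end = first_round + n_rounds;  // exclusive
  int last_round = end - 1;
  std::vector<int> rounds;
  if (interval > 0) {
    for (int i = first_round; i < end; i += interval) {
      rounds.push_back(i);
    }
  }
  // The final round is always checkpointed, matching arr.contains(lastRound).
  if (rounds.empty() || rounds.back() != last_round) {
    rounds.push_back(last_round);
  }
  return rounds;
}

// e.g. CheckpointRounds(0, 2, 8) == {0, 2, 4, 6, 7}, matching the Scala test
// assertion for getCheckpointRounds(0, 2, 8).
```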
diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp index c0c077430..332b1a127 100644 --- a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp +++ b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp @@ -684,6 +684,85 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterPredict return ret; } +/* + * Class: ml_dmlc_xgboost4j_java_XGBoostJNI + * Method: XGBoosterPredictFromDense + * Signature: (J[FJJFIII[F[[F)I + */ +JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterPredictFromDense( + JNIEnv *jenv, jclass jcls, jlong jhandle, jfloatArray jdata, jlong num_rows, jlong num_features, + jfloat missing, jint iteration_begin, jint iteration_end, jint predict_type, + jfloatArray jmargin, jobjectArray jout) { + API_BEGIN(); + BoosterHandle handle = reinterpret_cast(jhandle); + + /** + * Create array interface. + */ + namespace linalg = xgboost::linalg; + jfloat *data = jenv->GetFloatArrayElements(jdata, nullptr); + xgboost::Context ctx; + auto t_data = linalg::MakeTensorView( + ctx.Device(), + xgboost::common::Span{data, static_cast(num_rows * num_features)}, num_rows, + num_features); + auto s_array = linalg::ArrayInterfaceStr(t_data); + + /** + * Create configuration object. + */ + xgboost::Json config{xgboost::Object{}}; + config["cache_id"] = xgboost::Integer{}; + config["type"] = xgboost::Integer{static_cast(predict_type)}; + config["iteration_begin"] = xgboost::Integer{static_cast(iteration_begin)}; + config["iteration_end"] = xgboost::Integer{static_cast(iteration_end)}; + config["missing"] = xgboost::Number{static_cast(missing)}; + config["strict_shape"] = xgboost::Boolean{true}; + std::string s_config; + xgboost::Json::Dump(config, &s_config); + + /** + * Handle base margin + */ + BoosterHandle proxy{nullptr}; + + float *margin{nullptr}; + if (jmargin) { + margin = jenv->GetFloatArrayElements(jmargin, nullptr); + JVM_CHECK_CALL(XGProxyDMatrixCreate(&proxy)); + JVM_CHECK_CALL( + XGDMatrixSetFloatInfo(proxy, "base_margin", margin, jenv->GetArrayLength(jmargin))); + } + + bst_ulong const *out_shape; + bst_ulong out_dim; + float const *result; + auto ret = XGBoosterPredictFromDense(handle, s_array.c_str(), s_config.c_str(), proxy, &out_shape, + &out_dim, &result); + + jenv->ReleaseFloatArrayElements(jdata, data, 0); + if (proxy) { + XGDMatrixFree(proxy); + jenv->ReleaseFloatArrayElements(jmargin, margin, 0); + } + + if (ret != 0) { + return ret; + } + + std::size_t n{1}; + for (std::size_t i = 0; i < out_dim; ++i) { + n *= out_shape[i]; + } + + jfloatArray jarray = jenv->NewFloatArray(n); + + jenv->SetFloatArrayRegion(jarray, 0, n, result); + jenv->SetObjectArrayElement(jout, 0, jarray); + + API_END(); +} + /* * Class: ml_dmlc_xgboost4j_java_XGBoostJNI * Method: XGBoosterLoadModel @@ -905,33 +984,6 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterSetAttr return ret; } -/* - * Class: ml_dmlc_xgboost4j_java_XGBoostJNI - * Method: XGBoosterLoadRabitCheckpoint - * Signature: (J[I)I - */ -JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterLoadRabitCheckpoint - (JNIEnv *jenv , jclass jcls, jlong jhandle, jintArray jout) { - BoosterHandle handle = (BoosterHandle) jhandle; - int version; - int ret = XGBoosterLoadRabitCheckpoint(handle, &version); - JVM_CHECK_CALL(ret); - jint jversion = version; - jenv->SetIntArrayRegion(jout, 0, 1, &jversion); - return ret; -} - -/* - * Class: ml_dmlc_xgboost4j_java_XGBoostJNI - * Method: XGBoosterSaveRabitCheckpoint - * 
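The JNI binding above drives `XGBoosterPredictFromDense` from the C API. A sketch of the same call from plain C++, assuming a trained booster in `handle`; the array-interface JSON is hand-rolled here for illustration (the JNI code builds it with `linalg::ArrayInterfaceStr`), `missing` is a finite placeholder, and `XGGetLastError` handling is elided:

```cpp
#include <cstdint>
#include <string>
#include <vector>

#include "xgboost/c_api.h"

int PredictDense(BoosterHandle handle, std::vector<float> const &data,
                 std::int64_t n_rows, std::int64_t n_cols) {
  // Dense row-major float32 input, described by an array-interface JSON.
  std::string array_interface =
      std::string{"{\"data\": ["} +
      std::to_string(reinterpret_cast<std::uintptr_t>(data.data())) +
      ", true], \"shape\": [" + std::to_string(n_rows) + ", " +
      std::to_string(n_cols) + "], \"typestr\": \"<f4\", \"version\": 3}";
  // type=0 requests raw values (kValue in the Java enum); the empty
  // iteration range [0, 0) means "use all trees", matching the Java default.
  std::string config =
      "{\"type\": 0, \"iteration_begin\": 0, \"iteration_end\": 0, "
      "\"missing\": -999.0, \"strict_shape\": true, \"cache_id\": 0}";

  bst_ulong const *out_shape{nullptr};
  bst_ulong out_dim{0};
  float const *out_result{nullptr};
  // No proxy DMatrix is needed when no base margin is supplied.
  return XGBoosterPredictFromDense(handle, array_interface.c_str(),
                                   config.c_str(), /*m=*/nullptr, &out_shape,
                                   &out_dim, &out_result);
}
```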
Signature: (J)I - */ -JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterSaveRabitCheckpoint - (JNIEnv *jenv, jclass jcls, jlong jhandle) { - BoosterHandle handle = (BoosterHandle) jhandle; - return XGBoosterSaveRabitCheckpoint(handle); -} - /* * Class: ml_dmlc_xgboost4j_java_XGBoostJNI * Method: XGBoosterGetNumFeature @@ -948,6 +1000,17 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterGetNumFea return ret; } +JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterGetNumBoostedRound( + JNIEnv *jenv, jclass, jlong jhandle, jintArray jout) { + BoosterHandle handle = (BoosterHandle)jhandle; + std::int32_t n_rounds{0}; + auto ret = XGBoosterBoostedRounds(handle, &n_rounds); + JVM_CHECK_CALL(ret); + jint jn_rounds = n_rounds; + jenv->SetIntArrayRegion(jout, 0, 1, &jn_rounds); + return ret; +} + /* * Class: ml_dmlc_xgboost4j_java_XGBoostJNI * Method: CommunicatorInit diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j.h b/jvm-packages/xgboost4j/src/native/xgboost4j.h index b221c6a57..cc4ad53d4 100644 --- a/jvm-packages/xgboost4j/src/native/xgboost4j.h +++ b/jvm-packages/xgboost4j/src/native/xgboost4j.h @@ -207,6 +207,14 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterEvalOneIt JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterPredict (JNIEnv *, jclass, jlong, jlong, jint, jint, jobjectArray); +/* + * Class: ml_dmlc_xgboost4j_java_XGBoostJNI + * Method: XGBoosterPredictFromDense + * Signature: (J[FJJFIII[F[[F)I + */ +JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterPredictFromDense + (JNIEnv *, jclass, jlong, jfloatArray, jlong, jlong, jfloat, jint, jint, jint, jfloatArray, jobjectArray); + /* * Class: ml_dmlc_xgboost4j_java_XGBoostJNI * Method: XGBoosterLoadModel @@ -279,22 +287,6 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterGetAttr JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterSetAttr (JNIEnv *, jclass, jlong, jstring, jstring); -/* - * Class: ml_dmlc_xgboost4j_java_XGBoostJNI - * Method: XGBoosterLoadRabitCheckpoint - * Signature: (J[I)I - */ -JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterLoadRabitCheckpoint - (JNIEnv *, jclass, jlong, jintArray); - -/* - * Class: ml_dmlc_xgboost4j_java_XGBoostJNI - * Method: XGBoosterSaveRabitCheckpoint - * Signature: (J)I - */ -JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterSaveRabitCheckpoint - (JNIEnv *, jclass, jlong); - /* * Class: ml_dmlc_xgboost4j_java_XGBoostJNI * Method: XGBoosterGetNumFeature @@ -303,6 +295,14 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterSaveRabit JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterGetNumFeature (JNIEnv *, jclass, jlong, jlongArray); +/* + * Class: ml_dmlc_xgboost4j_java_XGBoostJNI + * Method: XGBoosterGetNumBoostedRound + * Signature: (J[I)I + */ +JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterGetNumBoostedRound + (JNIEnv *, jclass, jlong, jintArray); + /* * Class: ml_dmlc_xgboost4j_java_XGBoostJNI * Method: CommunicatorInit @@ -359,14 +359,6 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_CommunicatorAllred JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixSetInfoFromInterface (JNIEnv *, jclass, jlong, jstring, jstring); -/* - * Class: ml_dmlc_xgboost4j_java_XGBoostJNI - * Method: XGDeviceQuantileDMatrixCreateFromCallback - * Signature: (Ljava/util/Iterator;FII[J)I - */ 
-JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDeviceQuantileDMatrixCreateFromCallback - (JNIEnv *, jclass, jobject, jfloat, jint, jint, jlongArray); - /* * Class: ml_dmlc_xgboost4j_java_XGBoostJNI * Method: XGQuantileDMatrixCreateFromCallback diff --git a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java index 70966a38f..b686ddbed 100644 --- a/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java +++ b/jvm-packages/xgboost4j/src/test/java/ml/dmlc/xgboost4j/java/BoosterImplTest.java @@ -1,5 +1,5 @@ /* - Copyright (c) 2014-2022 by Contributors + Copyright (c) 2014-2023 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,16 +15,23 @@ */ package ml.dmlc.xgboost4j.java; -import java.io.*; -import java.util.*; - import junit.framework.TestCase; import org.junit.Test; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.util.*; +import java.util.concurrent.*; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.fail; + /** - * test cases for Booster + * test cases for Booster Inplace Predict * - * @author hzx + * @author hzx and Sovrn */ public class BoosterImplTest { private String train_uri = "../../demo/data/agaricus.txt.train?indexing_mode=1&format=libsvm"; @@ -99,6 +106,179 @@ public class BoosterImplTest { TestCase.assertTrue(eval.eval(predicts, testMat) < 0.1f); } + @Test + public void inplacePredictTest() throws XGBoostError { + /* Data Generation */ + // Generate a training set. + int trainRows = 1000; + int features = 10; + int trainSize = trainRows * features; + float[] trainX = generateRandomDataSet(trainSize); + float[] trainY = generateRandomDataSet(trainRows); + + DMatrix trainingMatrix = new DMatrix(trainX, trainRows, features, Float.NaN); + trainingMatrix.setLabel(trainY); + + // Generate a testing set + int testRows = 10; + int testSize = testRows * features; + float[] testX = generateRandomDataSet(testSize); + float[] testY = generateRandomDataSet(testRows); + + DMatrix testingMatrix = new DMatrix(testX, testRows, features, Float.NaN); + testingMatrix.setLabel(testY); + + /* Training */ + + // Set parameters + Map params = new HashMap<>(); + params.put("eta", 1.0); + params.put("max_depth",2); + params.put("silent", 1); + params.put("tree_method", "hist"); + + Map watches = new HashMap<>(); + watches.put("train", trainingMatrix); + watches.put("test", testingMatrix); + + Booster booster = XGBoost.train(trainingMatrix, params, 10, watches, null, null); + + /* Prediction */ + + // Standard prediction + float[][] predictions = booster.predict(testingMatrix); + + // Inplace-prediction + float[][] inplacePredictions = booster.inplace_predict(testX, testRows, features, Float.NaN); + + // Confirm that the two prediction results are identical + assertArrayEquals(predictions, inplacePredictions); + } + + @Test + public void inplacePredictMultiPredictTest() throws InterruptedException { + // Multithreaded, multiple prediction + int trainRows = 1000; + int features = 10; + int trainSize = trainRows * features; + + int testRows = 10; + int testSize = testRows * features; + + //Simulate multiple predictions on multiple random data sets simultaneously. 
+ ExecutorService executorService = Executors.newFixedThreadPool(5); + int predictsToPerform = 100; + for(int i = 0; i < predictsToPerform; i++) { + executorService.submit(() -> { + try { + float[] trainX = generateRandomDataSet(trainSize); + float[] trainY = generateRandomDataSet(trainRows); + DMatrix trainingMatrix = new DMatrix(trainX, trainRows, features, Float.NaN); + trainingMatrix.setLabel(trainY); + + float[] testX = generateRandomDataSet(testSize); + float[] testY = generateRandomDataSet(testRows); + DMatrix testingMatrix = new DMatrix(testX, testRows, features, Float.NaN); + testingMatrix.setLabel(testY); + + Map params = new HashMap<>(); + params.put("eta", 1.0); + params.put("max_depth", 2); + params.put("silent", 1); + params.put("tree_method", "hist"); + + Map watches = new HashMap<>(); + watches.put("train", trainingMatrix); + watches.put("test", testingMatrix); + + Booster booster = XGBoost.train(trainingMatrix, params, 10, watches, null, null); + + float[][] predictions = booster.predict(testingMatrix); + float[][] inplacePredictions = booster.inplace_predict(testX, testRows, features, Float.NaN); + + assertArrayEquals(predictions, inplacePredictions); + } catch (XGBoostError xgBoostError) { + fail(xgBoostError.getMessage()); + } + }); + } + executorService.shutdown(); + if(!executorService.awaitTermination(1, TimeUnit.MINUTES)) + executorService.shutdownNow(); + } + + @Test + public void inplacePredictWithMarginTest() throws XGBoostError { + //Generate a training set + int trainRows = 1000; + int features = 10; + int trainSize = trainRows * features; + float[] trainX = generateRandomDataSet(trainSize); + float[] trainY = generateRandomDataSet(trainRows); + + DMatrix trainingMatrix = new DMatrix(trainX, trainRows, features, Float.NaN); + trainingMatrix.setLabel(trainY); + + // Generate a testing set + int testRows = 10; + int testSize = testRows * features; + float[] testX = generateRandomDataSet(testSize); + float[] testY = generateRandomDataSet(testRows); + + DMatrix testingMatrix = new DMatrix(testX, testRows, features, Float.NaN); + testingMatrix.setLabel(testY); + + // Set booster parameters + Map params = new HashMap<>(); + params.put("eta", 1.0); + params.put("max_depth",2); + params.put("tree_method", "hist"); + params.put("base_score", 0.0); + + Map watches = new HashMap<>(); + watches.put("train", trainingMatrix); + watches.put("test", testingMatrix); + + // Train booster on training matrix. 
+ Booster booster = XGBoost.train(trainingMatrix, params, 10, watches, null, null); + + // Create a margin + float[] margin = new float[testRows]; + Arrays.fill(margin, 0.5f); + + // Define an iteration range that uses all training iterations; this should match + // the call without margin, + // which defines an iteration range of [0,0) + int[] iterationRange = new int[] { 0, 0 }; + + float[][] inplacePredictionsWithMargin = booster.inplace_predict(testX, + testRows, + features, + Float.NaN, + iterationRange, + Booster.PredictionType.kValue, + margin); + float[][] inplacePredictionsWithoutMargin = booster.inplace_predict(testX, testRows, features, Float.NaN); + + for (int i = 0; i < inplacePredictionsWithoutMargin.length; i++) { + for (int j = 0; j < inplacePredictionsWithoutMargin[i].length; j++) { + inplacePredictionsWithoutMargin[i][j] += margin[j]; + } + } + for (int i = 0; i < inplacePredictionsWithoutMargin.length; i++) { + assertArrayEquals(inplacePredictionsWithMargin[i], inplacePredictionsWithoutMargin[i], 1e-6f); + } + } + + private float[] generateRandomDataSet(int size) { + float[] newSet = new float[size]; + Random random = new Random(); + for(int i = 0; i < size; i++) { + newSet[i] = random.nextFloat(); + } + return newSet; + } + @Test public void saveLoadModelWithPath() throws XGBoostError, IOException { DMatrix trainMat = new DMatrix(this.train_uri); @@ -664,14 +844,12 @@ public class BoosterImplTest { float tempBoosterError = eval.eval(tempBooster.predict(testMat, true, 0), testMat); // Save tempBooster to bytestream and load back - int prevVersion = tempBooster.getVersion(); ByteArrayInputStream in = new ByteArrayInputStream(tempBooster.toByteArray()); tempBooster = XGBoost.loadModel(in); in.close(); - tempBooster.setVersion(prevVersion); // Continue training using tempBooster - round = 4; + round = 2; Booster booster2 = XGBoost.train(trainMat, paramMap, round, watches, null, null, null, 0, tempBooster); float booster2error = eval.eval(booster2.predict(testMat, true, 0), testMat); TestCase.assertTrue(booster1error == booster2error); diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt index 7026238e3..6089ae486 100644 --- a/plugin/CMakeLists.txt +++ b/plugin/CMakeLists.txt @@ -1,8 +1,8 @@ -if (PLUGIN_DENSE_PARSER) +if(PLUGIN_DENSE_PARSER) target_sources(objxgboost PRIVATE ${xgboost_SOURCE_DIR}/plugin/dense_parser/dense_libsvm.cc) -endif (PLUGIN_DENSE_PARSER) +endif() -if (PLUGIN_UPDATER_ONEAPI) +if(PLUGIN_UPDATER_ONEAPI) add_library(oneapi_plugin OBJECT ${xgboost_SOURCE_DIR}/plugin/updater_oneapi/regression_obj_oneapi.cc ${xgboost_SOURCE_DIR}/plugin/updater_oneapi/predictor_oneapi.cc) @@ -18,17 +18,17 @@ if (PLUGIN_UPDATER_ONEAPI) CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON POSITION_INDEPENDENT_CODE ON) - if (USE_OPENMP) + if(USE_OPENMP) find_package(OpenMP REQUIRED) target_link_libraries(oneapi_plugin PUBLIC OpenMP::OpenMP_CXX) - endif (USE_OPENMP) + endif() # Get compilation and link flags of oneapi_plugin and propagate to objxgboost target_link_libraries(objxgboost PUBLIC oneapi_plugin) # Add all objects of oneapi_plugin to objxgboost target_sources(objxgboost INTERFACE $<TARGET_OBJECTS:oneapi_plugin>) -endif (PLUGIN_UPDATER_ONEAPI) +endif() # Add the Federated Learning plugin if enabled.
-if (PLUGIN_FEDERATED) +if(PLUGIN_FEDERATED) add_subdirectory(federated) -endif (PLUGIN_FEDERATED) +endif() diff --git a/plugin/federated/federated.proto b/plugin/federated/federated.proto index d8ef5bd92..8450659fd 100644 --- a/plugin/federated/federated.proto +++ b/plugin/federated/federated.proto @@ -7,6 +7,7 @@ package xgboost.federated; service Federated { rpc Allgather(AllgatherRequest) returns (AllgatherReply) {} + rpc AllgatherV(AllgatherVRequest) returns (AllgatherVReply) {} rpc Allreduce(AllreduceRequest) returns (AllreduceReply) {} rpc Broadcast(BroadcastRequest) returns (BroadcastReply) {} } @@ -42,6 +43,17 @@ message AllgatherReply { bytes receive_buffer = 1; } +message AllgatherVRequest { + // An incrementing counter that is unique to each round of operations. + uint64 sequence_number = 1; + int32 rank = 2; + bytes send_buffer = 3; +} + +message AllgatherVReply { + bytes receive_buffer = 1; +} + message AllreduceRequest { // An incrementing counter that is unique to each round to operations. uint64 sequence_number = 1; diff --git a/plugin/federated/federated_client.h b/plugin/federated/federated_client.h index 2b4637339..ac1fbd57d 100644 --- a/plugin/federated/federated_client.h +++ b/plugin/federated/federated_client.h @@ -11,9 +11,7 @@ #include #include -namespace xgboost { -namespace federated { - +namespace xgboost::federated { /** * @brief A wrapper around the gRPC client. */ @@ -46,11 +44,11 @@ class FederatedClient { }()}, rank_{rank} {} - std::string Allgather(std::string const &send_buffer) { + std::string Allgather(std::string_view send_buffer) { AllgatherRequest request; request.set_sequence_number(sequence_number_++); request.set_rank(rank_); - request.set_send_buffer(send_buffer); + request.set_send_buffer(send_buffer.data(), send_buffer.size()); AllgatherReply reply; grpc::ClientContext context; @@ -65,6 +63,25 @@ class FederatedClient { } } + std::string AllgatherV(std::string_view send_buffer) { + AllgatherVRequest request; + request.set_sequence_number(sequence_number_++); + request.set_rank(rank_); + request.set_send_buffer(send_buffer.data(), send_buffer.size()); + + AllgatherVReply reply; + grpc::ClientContext context; + context.set_wait_for_ready(true); + grpc::Status status = stub_->AllgatherV(&context, request, &reply); + + if (status.ok()) { + return reply.receive_buffer(); + } else { + std::cout << status.error_code() << ": " << status.error_message() << '\n'; + throw std::runtime_error("AllgatherV RPC failed"); + } + } + std::string Allreduce(std::string const &send_buffer, DataType data_type, ReduceOperation reduce_operation) { AllreduceRequest request; @@ -112,6 +129,4 @@ class FederatedClient { int const rank_; uint64_t sequence_number_{}; }; - -} // namespace federated -} // namespace xgboost +} // namespace xgboost::federated diff --git a/plugin/federated/federated_communicator.h b/plugin/federated/federated_communicator.h index 7acd8a829..46c6b0fda 100644 --- a/plugin/federated/federated_communicator.h +++ b/plugin/federated/federated_communicator.h @@ -9,9 +9,7 @@ #include "../../src/common/io.h" #include "federated_client.h" -namespace xgboost { -namespace collective { - +namespace xgboost::collective { /** * @brief A Federated Learning communicator class that handles collective communication. */ @@ -118,23 +116,28 @@ class FederatedCommunicator : public Communicator { * \brief Get if the communicator is distributed. * \return True.
*/ - bool IsDistributed() const override { return true; } + [[nodiscard]] bool IsDistributed() const override { return true; } /** * \brief Get if the communicator is federated. * \return True. */ - bool IsFederated() const override { return true; } + [[nodiscard]] bool IsFederated() const override { return true; } /** - * \brief Perform in-place allgather. - * \param send_receive_buffer Buffer for both sending and receiving data. - * \param size Number of bytes to be gathered. + * \brief Perform allgather. + * \param input Buffer for sending data. */ - void AllGather(void *send_receive_buffer, std::size_t size) override { - std::string const send_buffer(reinterpret_cast(send_receive_buffer), size); - auto const received = client_->Allgather(send_buffer); - received.copy(reinterpret_cast(send_receive_buffer), size); + std::string AllGather(std::string_view input) override { + return client_->Allgather(input); + } + + /** + * \brief Perform variable-length allgather. + * \param input Buffer for sending data. + */ + std::string AllGatherV(std::string_view input) override { + return client_->AllgatherV(input); } /** @@ -189,5 +192,4 @@ class FederatedCommunicator : public Communicator { private: std::unique_ptr client_{}; }; -} // namespace collective -} // namespace xgboost +} // namespace xgboost::collective diff --git a/plugin/federated/federated_server.cc b/plugin/federated/federated_server.cc index c50bf1f35..ad6cf6022 100644 --- a/plugin/federated/federated_server.cc +++ b/plugin/federated/federated_server.cc @@ -11,9 +11,7 @@ #include "../../src/common/io.h" -namespace xgboost { -namespace federated { - +namespace xgboost::federated { grpc::Status FederatedService::Allgather(grpc::ServerContext*, AllgatherRequest const* request, AllgatherReply* reply) { handler_.Allgather(request->send_buffer().data(), request->send_buffer().size(), @@ -21,6 +19,13 @@ grpc::Status FederatedService::Allgather(grpc::ServerContext*, AllgatherRequest return grpc::Status::OK; } +grpc::Status FederatedService::AllgatherV(grpc::ServerContext*, AllgatherVRequest const* request, + AllgatherVReply* reply) { + handler_.AllgatherV(request->send_buffer().data(), request->send_buffer().size(), + reply->mutable_receive_buffer(), request->sequence_number(), request->rank()); + return grpc::Status::OK; +} + grpc::Status FederatedService::Allreduce(grpc::ServerContext*, AllreduceRequest const* request, AllreduceReply* reply) { handler_.Allreduce(request->send_buffer().data(), request->send_buffer().size(), @@ -38,8 +43,8 @@ grpc::Status FederatedService::Broadcast(grpc::ServerContext*, BroadcastRequest return grpc::Status::OK; } -void RunServer(int port, int world_size, char const* server_key_file, char const* server_cert_file, - char const* client_cert_file) { +void RunServer(int port, std::size_t world_size, char const* server_key_file, + char const* server_cert_file, char const* client_cert_file) { std::string const server_address = "0.0.0.0:" + std::to_string(port); FederatedService service{world_size}; @@ -61,7 +66,7 @@ void RunServer(int port, int world_size, char const* server_key_file, char const server->Wait(); } -void RunInsecureServer(int port, int world_size) { +void RunInsecureServer(int port, std::size_t world_size) { std::string const server_address = "0.0.0.0:" + std::to_string(port); FederatedService service{world_size}; @@ -75,6 +80,4 @@ void RunInsecureServer(int port, int world_size) { server->Wait(); } - -} // namespace federated -} // namespace xgboost +} // namespace xgboost::federated diff --git 
a/plugin/federated/federated_server.h b/plugin/federated/federated_server.h index 7738248ea..711ef5588 100644 --- a/plugin/federated/federated_server.h +++ b/plugin/federated/federated_server.h @@ -12,11 +12,14 @@ namespace federated { class FederatedService final : public Federated::Service { public: - explicit FederatedService(int const world_size) : handler_{world_size} {} + explicit FederatedService(std::size_t const world_size) : handler_{world_size} {} grpc::Status Allgather(grpc::ServerContext* context, AllgatherRequest const* request, AllgatherReply* reply) override; + grpc::Status AllgatherV(grpc::ServerContext* context, AllgatherVRequest const* request, + AllgatherVReply* reply) override; + grpc::Status Allreduce(grpc::ServerContext* context, AllreduceRequest const* request, AllreduceReply* reply) override; @@ -27,10 +30,10 @@ class FederatedService final : public Federated::Service { xgboost::collective::InMemoryHandler handler_; }; -void RunServer(int port, int world_size, char const* server_key_file, char const* server_cert_file, - char const* client_cert_file); +void RunServer(int port, std::size_t world_size, char const* server_key_file, + char const* server_cert_file, char const* client_cert_file); -void RunInsecureServer(int port, int world_size); +void RunInsecureServer(int port, std::size_t world_size); } // namespace federated } // namespace xgboost diff --git a/python-package/xgboost/callback.py b/python-package/xgboost/callback.py index 6077aa1e3..29d880539 100644 --- a/python-package/xgboost/callback.py +++ b/python-package/xgboost/callback.py @@ -540,7 +540,10 @@ class EvaluationMonitor(TrainingCallback): class TrainingCheckPoint(TrainingCallback): - """Checkpointing operation. + """Checkpointing operation. Users are encouraged to create their own callbacks for + checkpointing, as XGBoost doesn't handle distributed file systems. When checkpointing + on distributed systems, be sure to know the rank of the worker so that multiple + workers don't checkpoint to the same place. .. versionadded:: 1.3.0 @@ -553,9 +556,9 @@ class TrainingCheckPoint(TrainingCallback): pattern of output model file. Models will be saved as name_0.json, name_1.json, name_2.json .... as_pickle : - When set to True, all training parameters will be saved in pickle format, instead - of saving only the model. - iterations : + When set to True, all training parameters will be saved in pickle format, + instead of saving only the model. + interval : Interval of checkpointing. Checkpointing is slow, so setting a larger number can reduce the performance hit.
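For orientation, a quick usage sketch of the renamed `interval` argument (the directory, model name, and data below are illustrative, not part of the patch). Together with the `before_training` hook added in the next hunk, checkpoint files are numbered from a resumed model's `num_boosted_rounds()` rather than from zero:

    import os

    import numpy as np
    import xgboost as xgb
    from xgboost.callback import TrainingCheckPoint

    X, y = np.random.rand(256, 8), np.random.rand(256)
    dtrain = xgb.DMatrix(X, label=y)

    os.makedirs("checkpoints", exist_ok=True)
    # Write checkpoints/model_<round>.json every 5 boosting rounds.
    ckpt = TrainingCheckPoint(directory="checkpoints", name="model", interval=5)
    xgb.train({"tree_method": "hist"}, dtrain, num_boost_round=20, callbacks=[ckpt])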
@@ -566,15 +569,20 @@ class TrainingCheckPoint(TrainingCallback): directory: Union[str, os.PathLike], name: str = "model", as_pickle: bool = False, - iterations: int = 100, + interval: int = 100, ) -> None: self._path = os.fspath(directory) self._name = name self._as_pickle = as_pickle - self._iterations = iterations - self._epoch = 0 + self._iterations = interval + self._epoch = 0 # counter for interval + self._start = 0 # beginning iteration super().__init__() + def before_training(self, model: _Model) -> _Model: + self._start = model.num_boosted_rounds() + return model + def after_iteration( self, model: _Model, epoch: int, evals_log: TrainingCallback.EvalsLog ) -> bool: @@ -583,11 +591,12 @@ class TrainingCheckPoint(TrainingCallback): self._path, self._name + "_" - + str(epoch) + + (str(epoch + self._start)) + (".pkl" if self._as_pickle else ".json"), ) - self._epoch = 0 + self._epoch = 0 # reset counter if collective.get_rank() == 0: + # checkpoint using the first worker if self._as_pickle: with open(path, "wb") as fd: pickle.dump(model, fd) diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py index a01eeef09..c40dea5fd 100644 --- a/python-package/xgboost/compat.py +++ b/python-package/xgboost/compat.py @@ -88,6 +88,18 @@ def is_cudf_available() -> bool: return False +def is_cupy_available() -> bool: + """Check whether the cupy package is available.""" + if importlib.util.find_spec("cupy") is None: + return False + try: + import cupy + + return True + except ImportError: + return False + + try: import scipy.sparse as scipy_sparse from scipy.sparse import csr_matrix as scipy_csr diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 486cee514..648851b31 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -303,14 +303,14 @@ def _check_distributed_params(kwargs: Dict[str, Any]) -> None: def _validate_feature_info( - feature_info: Sequence[str], n_features: int, name: str + feature_info: Sequence[str], n_features: int, is_column_split: bool, name: str ) -> List[str]: if isinstance(feature_info, str) or not isinstance(feature_info, Sequence): raise TypeError( f"Expecting a sequence of strings for {name}, got: {type(feature_info)}" ) feature_info = list(feature_info) - if len(feature_info) != n_features and n_features != 0: + if len(feature_info) != n_features and n_features != 0 and not is_column_split: msg = ( f"{name} must have the same length as the number of data columns, ", f"expected {n_features}, got {len(feature_info)}", @@ -1231,6 +1231,16 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m _check_call(_LIB.XGDMatrixNumNonMissing(self.handle, ctypes.byref(ret))) return ret.value + def data_split_mode(self) -> DataSplitMode: + """Get the data split mode of the DMatrix. + + ..
versionadded:: 2.1.0 + + """ + ret = c_bst_ulong() + _check_call(_LIB.XGDMatrixDataSplitMode(self.handle, ctypes.byref(ret))) + return DataSplitMode(ret.value) + def slice( self, rindex: Union[List[int], np.ndarray], allow_groups: bool = False ) -> "DMatrix": @@ -1298,7 +1308,10 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m # validate feature name feature_names = _validate_feature_info( - feature_names, self.num_col(), "feature names" + feature_names, + self.num_col(), + self.data_split_mode() == DataSplitMode.COL, + "feature names", ) if len(feature_names) != len(set(feature_names)): values, counts = np.unique( @@ -1371,7 +1384,10 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m return feature_types = _validate_feature_info( - feature_types, self.num_col(), "feature types" + feature_types, + self.num_col(), + self.data_split_mode() == DataSplitMode.COL, + "feature types", ) feature_types_bytes = [bytes(f, encoding="utf-8") for f in feature_types] @@ -2431,9 +2447,12 @@ class Booster: from .data import ( _array_interface, + _arrow_transform, + _is_arrow, _is_cudf_df, _is_cupy_array, _is_list, + _is_np_array_like, _is_pandas_df, _is_pandas_series, _is_tuple, @@ -2441,6 +2460,8 @@ class Booster: ) enable_categorical = True + if _is_arrow(data): + data = _arrow_transform(data) if _is_pandas_series(data): import pandas as pd @@ -2463,7 +2484,7 @@ class Booster: f"got {data.shape[1]}" ) - if isinstance(data, np.ndarray): + if _is_np_array_like(data): from .data import _ensure_np_dtype data, _ = _ensure_np_dtype(data, data.dtype) diff --git a/python-package/xgboost/dask.py b/python-package/xgboost/dask/__init__.py similarity index 99% rename from python-package/xgboost/dask.py rename to python-package/xgboost/dask/__init__.py index f62a3e5af..a58c0f225 100644 --- a/python-package/xgboost/dask.py +++ b/python-package/xgboost/dask/__init__.py @@ -60,11 +60,11 @@ from typing import ( import numpy -from . 
import collective, config -from ._typing import _T, FeatureNames, FeatureTypes, ModelIn -from .callback import TrainingCallback -from .compat import DataFrame, LazyLoader, concat, lazy_isinstance -from .core import ( +from xgboost import collective, config +from xgboost._typing import _T, FeatureNames, FeatureTypes, ModelIn +from xgboost.callback import TrainingCallback +from xgboost.compat import DataFrame, LazyLoader, concat, lazy_isinstance +from xgboost.core import ( Booster, DataIter, DMatrix, @@ -75,8 +75,8 @@ from .core import ( _deprecate_positional_args, _expect, ) -from .data import _is_cudf_ser, _is_cupy_array -from .sklearn import ( +from xgboost.data import _is_cudf_ser, _is_cupy_array +from xgboost.sklearn import ( XGBClassifier, XGBClassifierBase, XGBClassifierMixIn, @@ -91,8 +91,8 @@ from .sklearn import ( _wrap_evaluation_matrices, xgboost_model_doc, ) -from .tracker import RabitTracker, get_host_ip -from .training import train as worker_train +from xgboost.tracker import RabitTracker, get_host_ip +from xgboost.training import train as worker_train if TYPE_CHECKING: import dask diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 04bdc5739..49287d817 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -5,7 +5,7 @@ import ctypes import json import os import warnings -from typing import Any, Callable, Iterator, List, Optional, Sequence, Tuple, cast +from typing import Any, Callable, List, Optional, Sequence, Tuple, cast import numpy as np @@ -107,6 +107,7 @@ def _from_scipy_csr( nthread: int, feature_names: Optional[FeatureNames], feature_types: Optional[FeatureTypes], + data_split_mode: DataSplitMode = DataSplitMode.ROW, ) -> DispatchedDataBackendReturnType: """Initialize data from a CSR matrix.""" @@ -118,7 +119,11 @@ def _from_scipy_csr( _array_interface(data.indices), _array_interface(data.data), c_bst_ulong(data.shape[1]), - make_jcargs(missing=float(missing), nthread=int(nthread)), + make_jcargs( + missing=float(missing), + nthread=int(nthread), + data_split_mode=int(data_split_mode), + ), ctypes.byref(handle), ) ) @@ -139,6 +144,7 @@ def _from_scipy_csc( nthread: int, feature_names: Optional[FeatureNames], feature_types: Optional[FeatureTypes], + data_split_mode: DataSplitMode = DataSplitMode.ROW, ) -> DispatchedDataBackendReturnType: """Initialize data from a CSC matrix.""" handle = ctypes.c_void_p() @@ -149,7 +155,11 @@ def _from_scipy_csc( _array_interface(data.indices), _array_interface(data.data), c_bst_ulong(data.shape[0]), - make_jcargs(missing=float(missing), nthread=int(nthread)), + make_jcargs( + missing=float(missing), + nthread=int(nthread), + data_split_mode=int(data_split_mode), + ), ctypes.byref(handle), ) ) @@ -164,8 +174,8 @@ def _is_scipy_coo(data: DataType) -> bool: return isinstance(data, scipy.sparse.coo_matrix) -def _is_numpy_array(data: DataType) -> bool: - return isinstance(data, (np.ndarray, np.matrix)) +def _is_np_array_like(data: DataType) -> bool: + return hasattr(data, "__array_interface__") def _ensure_np_dtype( @@ -317,7 +327,6 @@ def pandas_feature_info( ) -> Tuple[Optional[FeatureNames], Optional[FeatureTypes]]: """Handle feature info for pandas dataframe.""" import pandas as pd - from pandas.api.types import is_categorical_dtype, is_sparse # handle feature names if feature_names is None and meta is None: @@ -332,10 +341,10 @@ def pandas_feature_info( if feature_types is None and meta is None: feature_types = [] for dtype in data.dtypes: - if is_sparse(dtype): + if 
is_pd_sparse_dtype(dtype): feature_types.append(_pandas_dtype_mapper[dtype.subtype.name]) elif ( - is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype) + is_pd_cat_dtype(dtype) or is_pa_ext_categorical_dtype(dtype) ) and enable_categorical: feature_types.append(CAT_T) else: @@ -345,18 +354,13 @@ def pandas_feature_info( def is_nullable_dtype(dtype: PandasDType) -> bool: """Whether dtype is a pandas nullable type.""" - from pandas.api.types import ( - is_bool_dtype, - is_categorical_dtype, - is_float_dtype, - is_integer_dtype, - ) + from pandas.api.types import is_bool_dtype, is_float_dtype, is_integer_dtype is_int = is_integer_dtype(dtype) and dtype.name in pandas_nullable_mapper # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`. is_bool = is_bool_dtype(dtype) and dtype.name == "boolean" is_float = is_float_dtype(dtype) and dtype.name in pandas_nullable_mapper - return is_int or is_bool or is_float or is_categorical_dtype(dtype) + return is_int or is_bool or is_float or is_pd_cat_dtype(dtype) def is_pa_ext_dtype(dtype: Any) -> bool: @@ -371,17 +375,48 @@ def is_pa_ext_categorical_dtype(dtype: Any) -> bool: ) +def is_pd_cat_dtype(dtype: PandasDType) -> bool: + """Wrapper for testing pandas category type.""" + import pandas as pd + + if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"): + Version = pd.util.version.Version + if Version(pd.__version__) >= Version("2.1.0"): + from pandas import CategoricalDtype + + return isinstance(dtype, CategoricalDtype) + + from pandas.api.types import is_categorical_dtype + + return is_categorical_dtype(dtype) + + +def is_pd_sparse_dtype(dtype: PandasDType) -> bool: + """Wrapper for testing pandas sparse type.""" + import pandas as pd + + if hasattr(pd.util, "version") and hasattr(pd.util.version, "Version"): + Version = pd.util.version.Version + if Version(pd.__version__) >= Version("2.1.0"): + from pandas import SparseDtype + + return isinstance(dtype, SparseDtype) + + from pandas.api.types import is_sparse + + return is_sparse(dtype) + + def pandas_cat_null(data: DataFrame) -> DataFrame: """Handle categorical dtype and nullable extension types from pandas.""" import pandas as pd - from pandas.api.types import is_categorical_dtype # handle category codes and nullable. cat_columns = [] nul_columns = [] # avoid an unnecessary conversion if possible for col, dtype in zip(data.columns, data.dtypes): - if is_categorical_dtype(dtype): + if is_pd_cat_dtype(dtype): cat_columns.append(col) elif is_pa_ext_categorical_dtype(dtype): raise ValueError( @@ -398,7 +433,7 @@ def pandas_cat_null(data: DataFrame) -> DataFrame: transformed = data def cat_codes(ser: pd.Series) -> pd.Series: - if is_categorical_dtype(ser.dtype): + if is_pd_cat_dtype(ser.dtype): return ser.cat.codes assert is_pa_ext_categorical_dtype(ser.dtype) # Not yet supported, the index is not ordered for some reason. 
Alternately: @@ -454,14 +489,12 @@ def _transform_pandas_df( meta: Optional[str] = None, meta_type: Optional[NumpyDType] = None, ) -> Tuple[np.ndarray, Optional[FeatureNames], Optional[FeatureTypes]]: - from pandas.api.types import is_categorical_dtype, is_sparse - pyarrow_extension = False for dtype in data.dtypes: if not ( (dtype.name in _pandas_dtype_mapper) - or is_sparse(dtype) - or (is_categorical_dtype(dtype) and enable_categorical) + or is_pd_sparse_dtype(dtype) + or (is_pd_cat_dtype(dtype) and enable_categorical) or is_pa_ext_dtype(dtype) ): _invalid_dataframe_dtype(data) @@ -495,11 +528,14 @@ def _from_pandas_df( nthread: int, feature_names: Optional[FeatureNames], feature_types: Optional[FeatureTypes], + data_split_mode: DataSplitMode = DataSplitMode.ROW, ) -> DispatchedDataBackendReturnType: data, feature_names, feature_types = _transform_pandas_df( data, enable_categorical, feature_names, feature_types ) - return _from_numpy_array(data, missing, nthread, feature_names, feature_types) + return _from_numpy_array( + data, missing, nthread, feature_names, feature_types, data_split_mode + ) def _is_pandas_series(data: DataType) -> bool: @@ -515,9 +551,8 @@ def _meta_from_pandas_series( ) -> None: """Help transform pandas series for meta data like labels""" data = data.values.astype("float") - from pandas.api.types import is_sparse - if is_sparse(data): + if is_pd_sparse_dtype(getattr(data, "dtype", data)): data = data.to_dense() # type: ignore assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1 _meta_from_numpy(data, name, dtype, handle) @@ -539,13 +574,11 @@ def _from_pandas_series( feature_names: Optional[FeatureNames], feature_types: Optional[FeatureTypes], ) -> DispatchedDataBackendReturnType: - from pandas.api.types import is_categorical_dtype - if (data.dtype.name not in _pandas_dtype_mapper) and not ( - is_categorical_dtype(data.dtype) and enable_categorical + is_pd_cat_dtype(data.dtype) and enable_categorical ): _invalid_dataframe_dtype(data) - if enable_categorical and is_categorical_dtype(data.dtype): + if enable_categorical and is_pd_cat_dtype(data.dtype): data = data.cat.codes return _from_numpy_array( data.values.reshape(data.shape[0], 1).astype("float"), @@ -656,86 +689,51 @@ def _from_dt_df( def _is_arrow(data: DataType) -> bool: - try: - import pyarrow as pa - from pyarrow import dataset as arrow_dataset - - return isinstance(data, (pa.Table, arrow_dataset.Dataset)) - except ImportError: - return False + return lazy_isinstance(data, "pyarrow.lib", "Table") or lazy_isinstance( + data, "pyarrow._dataset", "Dataset" + ) -def record_batch_data_iter(data_iter: Iterator) -> Callable: - """Data iterator used to ingest Arrow columnar record batches. We are not using - class DataIter because it is only intended for building Device DMatrix and external - memory DMatrix. 
- - """ - from pyarrow.cffi import ffi - - c_schemas: List[ffi.CData] = [] - c_arrays: List[ffi.CData] = [] - - def _next(data_handle: int) -> int: - from pyarrow.cffi import ffi - - try: - batch = next(data_iter) - c_schemas.append(ffi.new("struct ArrowSchema*")) - c_arrays.append(ffi.new("struct ArrowArray*")) - ptr_schema = int(ffi.cast("uintptr_t", c_schemas[-1])) - ptr_array = int(ffi.cast("uintptr_t", c_arrays[-1])) - # pylint: disable=protected-access - batch._export_to_c(ptr_array, ptr_schema) - _check_call( - _LIB.XGImportArrowRecordBatch( - ctypes.c_void_p(data_handle), - ctypes.c_void_p(ptr_array), - ctypes.c_void_p(ptr_schema), - ) - ) - return 1 - except StopIteration: - return 0 - - return _next - - -def _from_arrow( - data: DataType, - missing: FloatCompatible, - nthread: int, - feature_names: Optional[FeatureNames], - feature_types: Optional[FeatureTypes], - enable_categorical: bool, -) -> DispatchedDataBackendReturnType: +def _arrow_transform(data: DataType) -> Any: + import pandas as pd import pyarrow as pa + from pyarrow.dataset import Dataset - if not all( - pa.types.is_integer(t) or pa.types.is_floating(t) for t in data.schema.types - ): - raise ValueError( - "Features in dataset can only be integers or floating point number" - ) - if enable_categorical: - raise ValueError("categorical data in arrow is not supported yet.") + if isinstance(data, Dataset): + raise TypeError("arrow Dataset is not supported.") - batches = data.to_batches() - rb_iter = iter(batches) - it = record_batch_data_iter(rb_iter) - next_callback = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_void_p)(it) - handle = ctypes.c_void_p() - config = from_pystr_to_cstr( - json.dumps({"missing": missing, "nthread": nthread, "nbatch": len(batches)}) - ) - _check_call( - _LIB.XGDMatrixCreateFromArrowCallback( - next_callback, - config, - ctypes.byref(handle), - ) - ) - return handle, feature_names, feature_types + data = cast(pa.Table, data) + + def type_mapper(dtype: pa.DataType) -> Optional[str]: + """Maps pyarrow type to pandas arrow extension type.""" + if pa.types.is_int8(dtype): + return pd.ArrowDtype(pa.int8()) + if pa.types.is_int16(dtype): + return pd.ArrowDtype(pa.int16()) + if pa.types.is_int32(dtype): + return pd.ArrowDtype(pa.int32()) + if pa.types.is_int64(dtype): + return pd.ArrowDtype(pa.int64()) + if pa.types.is_uint8(dtype): + return pd.ArrowDtype(pa.uint8()) + if pa.types.is_uint16(dtype): + return pd.ArrowDtype(pa.uint16()) + if pa.types.is_uint32(dtype): + return pd.ArrowDtype(pa.uint32()) + if pa.types.is_uint64(dtype): + return pd.ArrowDtype(pa.uint64()) + if pa.types.is_float16(dtype): + return pd.ArrowDtype(pa.float16()) + if pa.types.is_float32(dtype): + return pd.ArrowDtype(pa.float32()) + if pa.types.is_float64(dtype): + return pd.ArrowDtype(pa.float64()) + if pa.types.is_boolean(dtype): + return pd.ArrowDtype(pa.bool_()) + return None + + df = data.to_pandas(types_mapper=type_mapper) + return df def _is_cudf_df(data: DataType) -> bool: @@ -985,10 +983,13 @@ def _from_list( n_threads: int, feature_names: Optional[FeatureNames], feature_types: Optional[FeatureTypes], + data_split_mode: DataSplitMode = DataSplitMode.ROW, ) -> DispatchedDataBackendReturnType: array = np.array(data) _check_data_shape(data) - return _from_numpy_array(array, missing, n_threads, feature_names, feature_types) + return _from_numpy_array( + array, missing, n_threads, feature_names, feature_types, data_split_mode + ) def _is_tuple(data: DataType) -> bool: @@ -1001,8 +1002,11 @@ def _from_tuple( n_threads: int, 
feature_names: Optional[FeatureNames], feature_types: Optional[FeatureTypes], + data_split_mode: DataSplitMode = DataSplitMode.ROW, ) -> DispatchedDataBackendReturnType: - return _from_list(data, missing, n_threads, feature_names, feature_types) + return _from_list( + data, missing, n_threads, feature_names, feature_types, data_split_mode + ) def _is_iter(data: DataType) -> bool: @@ -1044,30 +1048,51 @@ def dispatch_data_backend( if not _is_cudf_ser(data) and not _is_pandas_series(data): _check_data_shape(data) if _is_scipy_csr(data): - return _from_scipy_csr(data, missing, threads, feature_names, feature_types) + return _from_scipy_csr( + data, missing, threads, feature_names, feature_types, data_split_mode + ) if _is_scipy_csc(data): - return _from_scipy_csc(data, missing, threads, feature_names, feature_types) + return _from_scipy_csc( + data, missing, threads, feature_names, feature_types, data_split_mode + ) if _is_scipy_coo(data): return _from_scipy_csr( - data.tocsr(), missing, threads, feature_names, feature_types + data.tocsr(), + missing, + threads, + feature_names, + feature_types, + data_split_mode, ) - if _is_numpy_array(data): + if _is_np_array_like(data): return _from_numpy_array( data, missing, threads, feature_names, feature_types, data_split_mode ) if _is_uri(data): return _from_uri(data, missing, feature_names, feature_types, data_split_mode) if _is_list(data): - return _from_list(data, missing, threads, feature_names, feature_types) + return _from_list( + data, missing, threads, feature_names, feature_types, data_split_mode + ) if _is_tuple(data): - return _from_tuple(data, missing, threads, feature_names, feature_types) + return _from_tuple( + data, missing, threads, feature_names, feature_types, data_split_mode + ) + if _is_arrow(data): + data = _arrow_transform(data) if _is_pandas_series(data): import pandas as pd data = pd.DataFrame(data) if _is_pandas_df(data): return _from_pandas_df( - data, enable_categorical, missing, threads, feature_names, feature_types + data, + enable_categorical, + missing, + threads, + feature_names, + feature_types, + data_split_mode, ) if _is_cudf_df(data) or _is_cudf_ser(data): return _from_cudf_df( @@ -1094,10 +1119,6 @@ def dispatch_data_backend( return _from_pandas_series( data, missing, threads, enable_categorical, feature_names, feature_types ) - if _is_arrow(data): - return _from_arrow( - data, missing, threads, feature_names, feature_types, enable_categorical - ) if _has_array_protocol(data): array = np.asarray(data) return _from_numpy_array(array, missing, threads, feature_names, feature_types) @@ -1194,9 +1215,11 @@ def dispatch_meta_backend( if _is_tuple(data): _meta_from_tuple(data, name, dtype, handle) return - if _is_numpy_array(data): + if _is_np_array_like(data): _meta_from_numpy(data, name, dtype, handle) return + if _is_arrow(data): + data = _arrow_transform(data) if _is_pandas_df(data): data, _, _ = _transform_pandas_df(data, False, meta=name, meta_type=dtype) _meta_from_numpy(data, name, dtype, handle) @@ -1281,7 +1304,7 @@ def _proxy_transform( return _transform_dlpack(data), None, feature_names, feature_types if _is_list(data) or _is_tuple(data): data = np.array(data) - if _is_numpy_array(data): + if _is_np_array_like(data): data, _ = _ensure_np_dtype(data, data.dtype) return data, None, feature_names, feature_types if _is_scipy_csr(data): @@ -1291,6 +1314,8 @@ def _proxy_transform( import pandas as pd data = pd.DataFrame(data) + if _is_arrow(data): + data = _arrow_transform(data) if _is_pandas_df(data): arr, 
feature_names, feature_types = _transform_pandas_df( data, enable_categorical, feature_names, feature_types @@ -1331,7 +1356,7 @@ def dispatch_proxy_set_data( if not allow_host: raise err - if _is_numpy_array(data): + if _is_np_array_like(data): _check_data_shape(data) proxy._set_data_from_array(data) # pylint: disable=W0212 return diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index d6667ad89..9fe73005a 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -22,7 +22,7 @@ from typing import ( import numpy as np import pandas as pd -from pyspark import SparkContext, cloudpickle +from pyspark import RDD, SparkContext, cloudpickle from pyspark.ml import Estimator, Model from pyspark.ml.functions import array_to_vector, vector_to_array from pyspark.ml.linalg import VectorUDT @@ -44,6 +44,7 @@ from pyspark.ml.util import ( MLWritable, MLWriter, ) +from pyspark.resource import ResourceProfileBuilder, TaskResourceRequests from pyspark.sql import Column, DataFrame from pyspark.sql.functions import col, countDistinct, pandas_udf, rand, struct from pyspark.sql.types import ( @@ -59,7 +60,7 @@ from scipy.special import expit, softmax # pylint: disable=no-name-in-module import xgboost from xgboost import XGBClassifier -from xgboost.compat import is_cudf_available +from xgboost.compat import is_cudf_available, is_cupy_available from xgboost.core import Booster, _check_distributed_params from xgboost.sklearn import DEFAULT_N_ESTIMATORS, XGBModel, _can_use_qdm from xgboost.training import train as worker_train @@ -88,6 +89,7 @@ from .utils import ( _get_rabit_args, _get_spark_session, _is_local, + _is_standalone_or_localcluster, deserialize_booster, deserialize_xgb_model, get_class_name, @@ -242,6 +244,13 @@ class _SparkXGBParams( TypeConverters.toList, ) + def set_device(self, value: str) -> "_SparkXGBParams": + """Set device, optional value: cpu, cuda, gpu""" + _check_distributed_params({"device": value}) + assert value in ("cpu", "cuda", "gpu") + self.set(self.device, value) + return self + @classmethod def _xgb_cls(cls) -> Type[XGBModel]: """ @@ -335,6 +344,54 @@ class _SparkXGBParams( predict_params[param.name] = self.getOrDefault(param) return predict_params + def _validate_gpu_params(self) -> None: + """Validate the gpu parameters and gpu configurations""" + + if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu): + ss = _get_spark_session() + sc = ss.sparkContext + + if _is_local(sc): + # Support GPU training in Spark local mode is just for debugging + # purposes, so it's okay for printing the below warning instead of + # checking the real gpu numbers and raising the exception. + get_logger(self.__class__.__name__).warning( + "You have enabled GPU in spark local mode. Please make sure your" + " local node has at least %d GPUs", + self.getOrDefault(self.num_workers), + ) + else: + executor_gpus = sc.getConf().get("spark.executor.resource.gpu.amount") + if executor_gpus is None: + raise ValueError( + "The `spark.executor.resource.gpu.amount` is required for training" + " on GPU." 
+ ) + + if not (ss.version >= "3.4.0" and _is_standalone_or_localcluster(sc)): + # We will enable stage-level scheduling in spark 3.4.0+ which doesn't + # require spark.task.resource.gpu.amount to be set explicitly + gpu_per_task = sc.getConf().get("spark.task.resource.gpu.amount") + if gpu_per_task is not None: + if float(gpu_per_task) < 1.0: + raise ValueError( + "XGBoost doesn't support GPU fractional configurations. " + "Please set `spark.task.resource.gpu.amount=spark.executor" + ".resource.gpu.amount`" + ) + + if float(gpu_per_task) > 1.0: + get_logger(self.__class__.__name__).warning( + "%s GPUs for each Spark task is configured, but each " + "XGBoost training task uses only 1 GPU.", + gpu_per_task, + ) + else: + raise ValueError( + "The `spark.task.resource.gpu.amount` is required for training" + " on GPU." + ) + def _validate_params(self) -> None: # pylint: disable=too-many-branches init_model = self.getOrDefault("xgb_model") @@ -414,53 +471,7 @@ class _SparkXGBParams( "`pyspark.ml.linalg.Vector` type." ) - if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu): - gpu_per_task = ( - _get_spark_session() - .sparkContext.getConf() - .get("spark.task.resource.gpu.amount") - ) - - is_local = _is_local(_get_spark_session().sparkContext) - - if is_local: - # checking spark local mode. - if gpu_per_task is not None: - raise RuntimeError( - "The spark local mode does not support gpu configuration." - "Please remove spark.executor.resource.gpu.amount and " - "spark.task.resource.gpu.amount" - ) - - # Support GPU training in Spark local mode is just for debugging - # purposes, so it's okay for printing the below warning instead of - # checking the real gpu numbers and raising the exception. - get_logger(self.__class__.__name__).warning( - "You have enabled GPU in spark local mode. Please make sure your" - " local node has at least %d GPUs", - self.getOrDefault(self.num_workers), - ) - else: - # checking spark non-local mode. - if gpu_per_task is not None: - if float(gpu_per_task) < 1.0: - raise ValueError( - "XGBoost doesn't support GPU fractional configurations. " - "Please set `spark.task.resource.gpu.amount=spark.executor" - ".resource.gpu.amount`" - ) - - if float(gpu_per_task) > 1.0: - get_logger(self.__class__.__name__).warning( - "%s GPUs for each Spark task is configured, but each " - "XGBoost training task uses only 1 GPU.", - gpu_per_task, - ) - else: - raise ValueError( - "The `spark.task.resource.gpu.amount` is required for training" - " on GPU." - ) + self._validate_gpu_params() def _validate_and_convert_feature_col_as_float_col_list( @@ -585,6 +596,8 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): arbitrary_params_dict={}, ) + self.logger = get_logger(self.__class__.__name__) + def setParams(self, **kwargs: Any) -> None: # pylint: disable=invalid-name """ Set params for the estimator. 
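To make the new validation concrete: a sketch of the Spark session setup that `_validate_gpu_params` expects for distributed GPU training. The master URL and resource amounts are illustrative assumptions, not values taken from the patch:

    from pyspark.sql import SparkSession
    from xgboost.spark import SparkXGBClassifier

    spark = (
        SparkSession.builder.master("spark://host:7077")
        .config("spark.executor.cores", "4")
        # Required whenever training on GPU outside local mode.
        .config("spark.executor.resource.gpu.amount", "1")
        # Required on Spark < 3.4.0 or non-standalone masters; on 3.4.0+
        # standalone/local-cluster it can be left to stage-level scheduling.
        .config("spark.task.resource.gpu.amount", "1")
        .getOrCreate()
    )

    clf = SparkXGBClassifier(num_workers=2)
    clf.set_device("cuda")  # checked by _check_distributed_params and the assert above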
@@ -887,6 +900,116 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): return booster_params, train_call_kwargs_params, dmatrix_kwargs + + def _skip_stage_level_scheduling(self) -> bool: + # pylint: disable=too-many-return-statements + """Check whether stage-level scheduling can be skipped; + return True to skip it""" + + if use_cuda(self.getOrDefault(self.device)) or self.getOrDefault(self.use_gpu): + ss = _get_spark_session() + sc = ss.sparkContext + + if ss.version < "3.4.0": + self.logger.info( + "Stage-level scheduling in xgboost requires spark version 3.4.0+" + ) + return True + + if not _is_standalone_or_localcluster(sc): + self.logger.info( + "Stage-level scheduling in xgboost requires spark standalone or " + "local-cluster mode" + ) + return True + + executor_cores = sc.getConf().get("spark.executor.cores") + executor_gpus = sc.getConf().get("spark.executor.resource.gpu.amount") + if executor_cores is None or executor_gpus is None: + self.logger.info( + "Stage-level scheduling in xgboost requires spark.executor.cores " + "and spark.executor.resource.gpu.amount to be set." + ) + return True + + if int(executor_cores) == 1: + # there will be only 1 task running at any time. + self.logger.info( + "Stage-level scheduling in xgboost requires spark.executor.cores > 1" + ) + return True + + if int(executor_gpus) > 1: + # For spark.executor.resource.gpu.amount > 1, we assume the user knows + # how to configure it so that xgboost runs successfully. + self.logger.info( + "Stage-level scheduling in xgboost will not work " + "when spark.executor.resource.gpu.amount>1" + ) + return True + + task_gpu_amount = sc.getConf().get("spark.task.resource.gpu.amount") + + if task_gpu_amount is None: + # The ETL tasks will not grab a gpu when spark.task.resource.gpu.amount is not set, + # but with stage-level scheduling, we can make the training tasks grab the gpu. + return False + + if float(task_gpu_amount) == float(executor_gpus): + # spark.executor.resource.gpu.amount=spark.task.resource.gpu.amount + # results in only 1 task running at a time, which may cause a performance issue. + return True + + # We can enable stage-level scheduling + return False + + # CPU training doesn't require stage-level scheduling + return True + + def _try_stage_level_scheduling(self, rdd: RDD) -> RDD: + """Try to enable stage-level scheduling""" + + if self._skip_stage_level_scheduling(): + return rdd + + ss = _get_spark_session() + + # executor_cores will not be None + executor_cores = ss.sparkContext.getConf().get("spark.executor.cores") + assert executor_cores is not None + + # Spark-rapids is a project to leverage GPUs to accelerate spark SQL. + # If spark-rapids is enabled, to avoid GPU OOM, we don't allow other + # ETL gpu tasks running alongside training tasks. + spark_plugins = ss.conf.get("spark.plugins", " ") + assert spark_plugins is not None + spark_rapids_sql_enabled = ss.conf.get("spark.rapids.sql.enabled", "true") + assert spark_rapids_sql_enabled is not None + + task_cores = ( + int(executor_cores) + if "com.nvidia.spark.SQLPlugin" in spark_plugins + and "true" == spark_rapids_sql_enabled.lower() + else (int(executor_cores) // 2) + 1 + ) + + # Each training task requires executor_cores//2 + 1 cpu cores, which makes + # sure the tasks are sent to different executors. + # + # Please note that we can't use GPU to limit the concurrent tasks because of + # https://issues.apache.org/jira/browse/SPARK-45527.
+ + task_gpus = 1.0 + treqs = TaskResourceRequests().cpus(task_cores).resource("gpu", task_gpus) + rp = ResourceProfileBuilder().require(treqs).build + + self.logger.info( + "XGBoost training tasks require the resource(cores=%s, gpu=%s).", + task_cores, + task_gpus, + ) + return rdd.withResources(rp) + def _fit(self, dataset: DataFrame) -> "_SparkXGBModel": # pylint: disable=too-many-statements, too-many-locals self._validate_params() @@ -987,14 +1110,16 @@ ) def _run_job() -> Tuple[str, str]: - ret = ( + rdd = ( dataset.mapInPandas( - _train_booster, schema="config string, booster string" # type: ignore + _train_booster, # type: ignore + schema="config string, booster string", ) .rdd.barrier() .mapPartitions(lambda x: x) - .collect()[0] ) + rdd_with_resource = self._try_stage_level_scheduling(rdd) + ret = rdd_with_resource.collect()[0] return ret[0], ret[1] get_logger("XGBoost-PySpark").info( @@ -1193,6 +1318,31 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable): dataset = dataset.drop(pred_struct_col) return dataset + def _gpu_transform(self) -> bool: + """Whether GPU is used for the prediction; returns True for GPU prediction""" + + if _is_local(_get_spark_session().sparkContext): + # in local mode, just use the internal "device" + return use_cuda(self.getOrDefault(self.device)) + + gpu_per_task = ( + _get_spark_session() + .sparkContext.getConf() + .get("spark.task.resource.gpu.amount") + ) + + # The user didn't set any gpu configuration, so just use the cpu + if gpu_per_task is None: + if use_cuda(self.getOrDefault(self.device)): + get_logger("XGBoost-PySpark").warning( + "Do the prediction on the CPUs since " + "no gpu configurations are set" + ) + return False + + # The user already set the gpu configurations, so just use the internal "device". + return use_cuda(self.getOrDefault(self.device)) + def _transform(self, dataset: DataFrame) -> DataFrame: # pylint: disable=too-many-statements, too-many-locals # Save xgb_sklearn_model and predict_params to be local variable @@ -1216,21 +1366,77 @@ _, schema = self._out_schema() + is_local = _is_local(_get_spark_session().sparkContext) + run_on_gpu = self._gpu_transform() + @pandas_udf(schema) # type: ignore def predict_udf(iterator: Iterator[pd.DataFrame]) -> Iterator[pd.Series]: assert xgb_sklearn_model is not None model = xgb_sklearn_model + + from pyspark import TaskContext + + context = TaskContext.get() + assert context is not None + + dev_ordinal = -1 + + if is_cudf_available(): + if is_local: + if run_on_gpu and is_cupy_available(): + import cupy as cp # pylint: disable=import-error + + total_gpus = cp.cuda.runtime.getDeviceCount() + if total_gpus > 0: + partition_id = context.partitionId() + # For transform in local mode, default the dev_ordinal to + # (partition id) % gpus.
+ dev_ordinal = partition_id % total_gpus + elif run_on_gpu: + dev_ordinal = _get_gpu_id(context) + + if dev_ordinal >= 0: + device = "cuda:" + str(dev_ordinal) + get_logger("XGBoost-PySpark").info( + "Do the inference with device: %s", device + ) + model.set_params(device=device) + else: + get_logger("XGBoost-PySpark").info("Do the inference on the CPUs") + else: + msg = ( + "CUDF is unavailable, fallback the inference on the CPUs" + if run_on_gpu + else "Do the inference on the CPUs" + ) + get_logger("XGBoost-PySpark").info(msg) + + def to_gpu_if_possible(data: ArrayLike) -> ArrayLike: + """Move the data to gpu if possible""" + if dev_ordinal >= 0: + import cudf # pylint: disable=import-error + import cupy as cp # pylint: disable=import-error + + # We must set the device after import cudf, which will change the device id to 0 + # See https://github.com/rapidsai/cudf/issues/11386 + cp.cuda.runtime.setDevice(dev_ordinal) # pylint: disable=I1101 + df = cudf.DataFrame(data) + del data + return df + return data + for data in iterator: if enable_sparse_data_optim: X = _read_csr_matrix_from_unwrapped_spark_vec(data) else: if feature_col_names is not None: - X = data[feature_col_names] + tmp = data[feature_col_names] else: - X = stack_series(data[alias.data]) + tmp = stack_series(data[alias.data]) + X = to_gpu_if_possible(tmp) if has_base_margin: - base_margin = data[alias.margin].to_numpy() + base_margin = to_gpu_if_possible(data[alias.margin]) else: base_margin = None diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py index b73dfba6c..193ca4b2a 100644 --- a/python-package/xgboost/spark/estimator.py +++ b/python-package/xgboost/spark/estimator.py @@ -198,7 +198,7 @@ class SparkXGBRegressor(_SparkXGBEstimator): """ @keyword_only - def __init__( + def __init__( # pylint:disable=too-many-arguments self, *, features_col: Union[str, List[str]] = "features", @@ -376,7 +376,7 @@ class SparkXGBClassifier(_SparkXGBEstimator, HasProbabilityCol, HasRawPrediction """ @keyword_only - def __init__( + def __init__( # pylint:disable=too-many-arguments self, *, features_col: Union[str, List[str]] = "features", @@ -566,7 +566,7 @@ class SparkXGBRanker(_SparkXGBEstimator): """ @keyword_only - def __init__( + def __init__( # pylint:disable=too-many-arguments self, *, features_col: Union[str, List[str]] = "features", diff --git a/python-package/xgboost/spark/utils.py b/python-package/xgboost/spark/utils.py index 33a45a90e..395865386 100644 --- a/python-package/xgboost/spark/utils.py +++ b/python-package/xgboost/spark/utils.py @@ -10,7 +10,7 @@ from threading import Thread from typing import Any, Callable, Dict, Optional, Set, Type import pyspark -from pyspark import BarrierTaskContext, SparkContext, SparkFiles +from pyspark import BarrierTaskContext, SparkContext, SparkFiles, TaskContext from pyspark.sql.session import SparkSession from xgboost import Booster, XGBModel, collective @@ -129,7 +129,14 @@ def _is_local(spark_context: SparkContext) -> bool: return spark_context._jsc.sc().isLocal() -def _get_gpu_id(task_context: BarrierTaskContext) -> int: +def _is_standalone_or_localcluster(spark_context: SparkContext) -> bool: + master = spark_context.getConf().get("spark.master") + return master is not None and ( + master.startswith("spark://") or master.startswith("local-cluster") + ) + + +def _get_gpu_id(task_context: TaskContext) -> int: """Get the gpu id from the task resources""" if task_context is None: # This is a safety check. 
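The `_is_standalone_or_localcluster` helper above keys off the `spark.master` URL. A standalone restatement of the check (the function name here is hypothetical; the real helper reads the URL from `SparkConf`):

    def looks_like_standalone_or_localcluster(master: str) -> bool:
        # Standalone masters look like "spark://host:7077"; local-cluster
        # masters look like "local-cluster[2, 1, 1024]".
        return master.startswith("spark://") or master.startswith("local-cluster")

    assert looks_like_standalone_or_localcluster("spark://10.0.0.1:7077")
    assert looks_like_standalone_or_localcluster("local-cluster[2, 1, 1024]")
    assert not looks_like_standalone_or_localcluster("local[4]")  # plain local mode
    assert not looks_like_standalone_or_localcluster("yarn")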
diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 2e0933a43..391f2bf9f 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -10,6 +10,7 @@ import os import platform import socket import sys +import threading from concurrent.futures import ThreadPoolExecutor from contextlib import contextmanager from io import StringIO @@ -34,6 +35,7 @@ import pytest from scipy import sparse import xgboost as xgb +from xgboost import RabitTracker from xgboost.core import ArrayLike from xgboost.sklearn import SklObjective from xgboost.testing.data import ( @@ -938,3 +940,22 @@ def load_agaricus(path: str) -> Tuple[xgb.DMatrix, xgb.DMatrix]: def project_root(path: str) -> str: return normpath(os.path.join(demo_dir(path), os.path.pardir)) + + +def run_with_rabit(world_size: int, test_fn: Callable) -> None: + tracker = RabitTracker(host_ip="127.0.0.1", n_workers=world_size) + tracker.start(world_size) + + def run_worker(rabit_env: Dict[str, Union[str, int]]) -> None: + with xgb.collective.CommunicatorContext(**rabit_env): + test_fn() + + workers = [] + for _ in range(world_size): + worker = threading.Thread(target=run_worker, args=(tracker.worker_envs(),)) + workers.append(worker) + worker.start() + for worker in workers: + worker.join() + + tracker.join() diff --git a/rabit/CMakeLists.txt b/rabit/CMakeLists.txt index 977d4867e..4562f864f 100644 --- a/rabit/CMakeLists.txt +++ b/rabit/CMakeLists.txt @@ -6,10 +6,10 @@ set(RABIT_SOURCES ${CMAKE_CURRENT_LIST_DIR}/src/allreduce_base.cc ${CMAKE_CURRENT_LIST_DIR}/src/rabit_c_api.cc) -if (RABIT_MOCK) +if(RABIT_MOCK) list(APPEND RABIT_SOURCES ${CMAKE_CURRENT_LIST_DIR}/src/engine_mock.cc) -else () +else() list(APPEND RABIT_SOURCES ${CMAKE_CURRENT_LIST_DIR}/src/engine.cc) -endif () +endif() set(RABIT_SOURCES ${RABIT_SOURCES} PARENT_SCOPE) diff --git a/rabit/include/rabit/internal/io.h b/rabit/include/rabit/internal/io.h index d93f32ff9..d5d0fee4d 100644 --- a/rabit/include/rabit/internal/io.h +++ b/rabit/include/rabit/internal/io.h @@ -16,8 +16,8 @@ #include #include -#include "rabit/internal/utils.h" -#include "rabit/serializable.h" +#include "dmlc/io.h" +#include "xgboost/logging.h" namespace rabit::utils { /*! 
\brief re-use definition of dmlc::SeekStream */ @@ -84,8 +84,7 @@ struct MemoryBufferStream : public SeekStream { } ~MemoryBufferStream() override = default; size_t Read(void *ptr, size_t size) override { - utils::Assert(curr_ptr_ <= p_buffer_->length(), - "read can not have position excceed buffer length"); + CHECK_LE(curr_ptr_, p_buffer_->length()) << "read cannot have position exceed buffer length"; size_t nread = std::min(p_buffer_->length() - curr_ptr_, size); if (nread != 0) std::memcpy(ptr, &(*p_buffer_)[0] + curr_ptr_, nread); curr_ptr_ += nread; diff --git a/rabit/include/rabit/internal/socket.h b/rabit/include/rabit/internal/socket.h index 6fb7fe725..89e324482 100644 --- a/rabit/include/rabit/internal/socket.h +++ b/rabit/include/rabit/internal/socket.h @@ -29,11 +29,10 @@ #include #include #include +#include <system_error> // make_error_code, errc #include #include -#include "utils.h" - #if !defined(_WIN32) #include @@ -93,6 +92,17 @@ int PollImpl(PollFD* pfd, int nfds, std::chrono::seconds timeout) noexcept(true) #endif // IS_MINGW() } +template <typename E> +std::enable_if_t<std::is_integral_v<E>, xgboost::collective::Result> PollError(E const& revents) { + if ((revents & POLLERR) != 0) { + return xgboost::system::FailWithCode("Poll error condition."); + } + if ((revents & POLLNVAL) != 0) { + return xgboost::system::FailWithCode("Invalid polling request."); + } + return xgboost::collective::Success(); +} + /*! \brief helper data structure to perform poll */ struct PollHelper { public: @@ -160,25 +170,32 @@ struct PollHelper { * * @param timeout specify timeout in seconds. Block if negative. */ - [[nodiscard]] xgboost::collective::Result Poll(std::chrono::seconds timeout) { + [[nodiscard]] xgboost::collective::Result Poll(std::chrono::seconds timeout, + bool check_error = true) { std::vector fdset; fdset.reserve(fds.size()); for (auto kv : fds) { fdset.push_back(kv.second); } - int ret = PollImpl(fdset.data(), fdset.size(), timeout); + std::int32_t ret = PollImpl(fdset.data(), fdset.size(), timeout); if (ret == 0) { - return xgboost::collective::Fail("Poll timeout."); + return xgboost::collective::Fail("Poll timeout.", std::make_error_code(std::errc::timed_out)); } else if (ret < 0) { return xgboost::system::FailWithCode("Poll failed."); - } else { - for (auto& pfd : fdset) { - auto revents = pfd.revents & pfd.events; - if (!revents) { - fds.erase(pfd.fd); - } else { - fds[pfd.fd].events = revents; - } + } + + for (auto& pfd : fdset) { + auto result = PollError(pfd.revents); + if (check_error && !result.OK()) { + return result; + } + + auto revents = pfd.revents & pfd.events; + if (!revents) { + // FIXME(jiamingy): remove this once rabit is replaced.
+ fds.erase(pfd.fd); + } else { + fds[pfd.fd].events = revents; } } return xgboost::collective::Success(); diff --git a/rabit/src/allreduce_base.cc b/rabit/src/allreduce_base.cc index bd48d3599..416801ee2 100644 --- a/rabit/src/allreduce_base.cc +++ b/rabit/src/allreduce_base.cc @@ -115,9 +115,12 @@ bool AllreduceBase::Init(int argc, char* argv[]) { // start socket xgboost::system::SocketStartup(); utils::Assert(all_links.size() == 0, "can only call Init once"); - this->host_uri = xgboost::collective::GetHostName(); + auto rc = xgboost::collective::GetHostName(&this->host_uri); + if (!rc.OK()) { + LOG(FATAL) << rc.Report(); + } // get information from tracker - auto rc = this->ReConnectLinks(); + rc = this->ReConnectLinks(); if (rc.OK()) { return true; } @@ -406,13 +409,14 @@ void AllreduceBase::SetParam(const char *name, const char *val) { if (!match) all_links.emplace_back(std::move(r)); } sock_listen.Close(); + this->parent_index = -1; // setup tree links and ring structure tree_links.plinks.clear(); for (auto &all_link : all_links) { utils::Assert(!all_link.sock.BadSocket(), "ReConnectLink: bad socket"); // set the socket to non-blocking mode, enable TCP keepalive - all_link.sock.SetNonBlock(true); + CHECK(all_link.sock.NonBlocking(true).OK()); all_link.sock.SetKeepAlive(); if (rabit_enable_tcp_no_delay) { all_link.sock.SetNoDelay(); @@ -545,7 +549,7 @@ AllreduceBase::TryAllreduceTree(void *sendrecvbuf_, break; } // select must return - auto poll_res = watcher.Poll(timeout_sec); + auto poll_res = watcher.Poll(timeout_sec, false); // fail on macos if (!poll_res.OK()) { LOG(FATAL) << poll_res.Report(); } @@ -717,12 +721,11 @@ AllreduceBase::TryBroadcast(void *sendrecvbuf_, size_t total_size, int root) { } finished = false; } - watcher.WatchException(links[i].sock); } // finish running if (finished) break; // select - auto poll_res = watcher.Poll(timeout_sec); + auto poll_res = watcher.Poll(timeout_sec, false); // fail on macos if (!poll_res.OK()) { LOG(FATAL) << poll_res.Report(); } @@ -811,7 +814,7 @@ AllreduceBase::TryAllgatherRing(void *sendrecvbuf_, size_t total_size, break; } - auto poll_res = watcher.Poll(timeout_sec); + auto poll_res = watcher.Poll(timeout_sec, false); // fail on macos if (!poll_res.OK()) { LOG(FATAL) << poll_res.Report(); } @@ -916,7 +919,7 @@ AllreduceBase::TryReduceScatterRing(void *sendrecvbuf_, if (finished) { break; } - auto poll_res = watcher.Poll(timeout_sec); + auto poll_res = watcher.Poll(timeout_sec, false); // fail on macos if (!poll_res.OK()) { LOG(FATAL) << poll_res.Report(); } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 052f70b4c..84b431c27 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -11,10 +11,10 @@ set_source_files_properties( PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON) target_sources(objxgboost PRIVATE ${RABIT_SOURCES}) -if (USE_CUDA) +if(USE_CUDA) file(GLOB_RECURSE CUDA_SOURCES *.cu *.cuh) target_sources(objxgboost PRIVATE ${CUDA_SOURCES}) -endif (USE_CUDA) +endif() if (USE_HIP) file(GLOB_RECURSE HIP_SOURCES *.hip *.hip.h) @@ -27,9 +27,9 @@ target_include_directories(objxgboost ${xgboost_SOURCE_DIR}/dmlc-core/include ${xgboost_SOURCE_DIR}/rabit/include) -if (LOG_CAPI_INVOCATION) +if(LOG_CAPI_INVOCATION) target_compile_definitions(objxgboost PRIVATE -DLOG_CAPI_INVOCATION=1) -endif (LOG_CAPI_INVOCATION) +endif() # For MSVC: Call msvc_use_static_runtime() once again to completely # replace /MD with /MT. 
See https://github.com/dmlc/xgboost/issues/4462 diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index e74a3a4c4..cf922f5fd 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -271,8 +271,8 @@ XGB_DLL int XGDMatrixCreateFromDataIter( if (cache_info != nullptr) { scache = cache_info; } - xgboost::data::IteratorAdapter adapter(data_handle, callback); + xgboost::data::IteratorAdapter adapter( + data_handle, callback); xgboost_CHECK_C_ARG_PTR(out); *out = new std::shared_ptr { DMatrix::Create( @@ -447,8 +447,11 @@ XGB_DLL int XGDMatrixCreateFromCSR(char const *indptr, char const *indices, char auto config = Json::Load(StringView{c_json_config}); float missing = GetMissing(config); auto n_threads = OptionalArg(config, "nthread", 0); + auto data_split_mode = + static_cast(OptionalArg(config, "data_split_mode", 0)); xgboost_CHECK_C_ARG_PTR(out); - *out = new std::shared_ptr(DMatrix::Create(&adapter, missing, n_threads)); + *out = new std::shared_ptr( + DMatrix::Create(&adapter, missing, n_threads, "", data_split_mode)); API_END(); } @@ -483,8 +486,11 @@ XGB_DLL int XGDMatrixCreateFromCSC(char const *indptr, char const *indices, char auto config = Json::Load(StringView{c_json_config}); float missing = GetMissing(config); auto n_threads = OptionalArg(config, "nthread", common::OmpGetNumThreads(0)); + auto data_split_mode = + static_cast(OptionalArg(config, "data_split_mode", 0)); xgboost_CHECK_C_ARG_PTR(out); - *out = new std::shared_ptr(DMatrix::Create(&adapter, missing, n_threads)); + *out = new std::shared_ptr( + DMatrix::Create(&adapter, missing, n_threads, "", data_split_mode)); API_END(); } @@ -534,33 +540,8 @@ XGB_DLL int XGDMatrixCreateFromDT(void** data, const char** feature_stypes, API_END(); } -XGB_DLL int XGImportArrowRecordBatch(DataIterHandle data_handle, void *ptr_array, - void *ptr_schema) { - API_BEGIN(); - static_cast(data_handle) - ->SetData(static_cast(ptr_array), - static_cast(ptr_schema)); - API_END(); -} - -XGB_DLL int XGDMatrixCreateFromArrowCallback(XGDMatrixCallbackNext *next, char const *config, - DMatrixHandle *out) { - API_BEGIN(); - xgboost_CHECK_C_ARG_PTR(config); - auto jconfig = Json::Load(StringView{config}); - auto missing = GetMissing(jconfig); - auto n_batches = RequiredArg(jconfig, "nbatch", __func__); - auto n_threads = OptionalArg(jconfig, "nthread", 0); - data::RecordBatchesIterAdapter adapter(next, n_batches); - xgboost_CHECK_C_ARG_PTR(out); - *out = new std::shared_ptr(DMatrix::Create(&adapter, missing, n_threads)); - API_END(); -} - -XGB_DLL int XGDMatrixSliceDMatrix(DMatrixHandle handle, - const int* idxset, - xgboost::bst_ulong len, - DMatrixHandle* out) { +XGB_DLL int XGDMatrixSliceDMatrix(DMatrixHandle handle, const int *idxset, xgboost::bst_ulong len, + DMatrixHandle *out) { xgboost_CHECK_C_ARG_PTR(out); return XGDMatrixSliceDMatrixEx(handle, idxset, len, out, 0); } @@ -749,6 +730,15 @@ XGB_DLL int XGDMatrixNumNonMissing(DMatrixHandle const handle, xgboost::bst_ulon API_END(); } +XGB_DLL int XGDMatrixDataSplitMode(DMatrixHandle handle, bst_ulong *out) { + API_BEGIN(); + CHECK_HANDLE(); + auto p_m = CastDMatrixHandle(handle); + xgboost_CHECK_C_ARG_PTR(out); + *out = static_cast(p_m->Info().data_split_mode); + API_END(); +} + XGB_DLL int XGDMatrixGetDataAsCSR(DMatrixHandle const handle, char const *config, xgboost::bst_ulong *out_indptr, unsigned *out_indices, float *out_data) { @@ -1375,29 +1365,6 @@ XGB_DLL int XGBoosterSaveModelToBuffer(BoosterHandle handle, char const *json_co API_END(); } -XGB_DLL int XGBoosterGetModelRaw(BoosterHandle 
handle, xgboost::bst_ulong *out_len, - const char **out_dptr) { - API_BEGIN(); - CHECK_HANDLE(); - - auto *learner = static_cast(handle); - std::string& raw_str = learner->GetThreadLocal().ret_str; - raw_str.resize(0); - - common::MemoryBufferStream fo(&raw_str); - LOG(WARNING) << error::DeprecatedFunc(__func__, "1.6.0", "XGBoosterSaveModelToBuffer"); - - learner->Configure(); - learner->SaveModel(&fo); - - xgboost_CHECK_C_ARG_PTR(out_dptr); - xgboost_CHECK_C_ARG_PTR(out_len); - - *out_dptr = dmlc::BeginPtr(raw_str); - *out_len = static_cast(raw_str.length()); - API_END(); -} - // The following two functions are `Load` and `Save` for memory based // serialization methods. E.g. Python pickle. XGB_DLL int XGBoosterSerializeToBuffer(BoosterHandle handle, xgboost::bst_ulong *out_len, @@ -1432,36 +1399,13 @@ XGB_DLL int XGBoosterUnserializeFromBuffer(BoosterHandle handle, API_END(); } -XGB_DLL int XGBoosterLoadRabitCheckpoint(BoosterHandle handle, - int* version) { - API_BEGIN(); - CHECK_HANDLE(); - auto* bst = static_cast(handle); - xgboost_CHECK_C_ARG_PTR(version); - *version = rabit::LoadCheckPoint(); - if (*version != 0) { - bst->Configure(); - } - API_END(); -} - -XGB_DLL int XGBoosterSaveRabitCheckpoint(BoosterHandle handle) { - API_BEGIN(); - CHECK_HANDLE(); - auto *learner = static_cast(handle); - learner->Configure(); - rabit::CheckPoint(); - API_END(); -} - -XGB_DLL int XGBoosterSlice(BoosterHandle handle, int begin_layer, - int end_layer, int step, +XGB_DLL int XGBoosterSlice(BoosterHandle handle, int begin_layer, int end_layer, int step, BoosterHandle *out) { API_BEGIN(); CHECK_HANDLE(); xgboost_CHECK_C_ARG_PTR(out); - auto* learner = static_cast(handle); + auto *learner = static_cast(handle); bool out_of_bound = false; auto p_out = learner->Slice(begin_layer, end_layer, step, &out_of_bound); if (out_of_bound) { @@ -1797,7 +1741,7 @@ XGB_DLL int XGCommunicatorAllreduce(void *send_receive_buffer, size_t count, int } #if defined(XGBOOST_USE_FEDERATED) -XGB_DLL int XGBRunFederatedServer(int port, int world_size, char const *server_key_path, +XGB_DLL int XGBRunFederatedServer(int port, std::size_t world_size, char const *server_key_path, char const *server_cert_path, char const *client_cert_path) { API_BEGIN(); federated::RunServer(port, world_size, server_key_path, server_cert_path, client_cert_path); @@ -1805,7 +1749,7 @@ XGB_DLL int XGBRunFederatedServer(int port, int world_size, char const *server_k } // Run a server without SSL for local testing. 
-XGB_DLL int XGBRunInsecureFederatedServer(int port, int world_size) { +XGB_DLL int XGBRunInsecureFederatedServer(int port, std::size_t world_size) { API_BEGIN(); federated::RunInsecureServer(port, world_size); API_END(); diff --git a/src/c_api/c_api.cu b/src/c_api/c_api.cu index de21e9749..d37ca5670 100644 --- a/src/c_api/c_api.cu +++ b/src/c_api/c_api.cu @@ -75,7 +75,7 @@ void CopyGradientFromCUDAArrays(Context const *ctx, ArrayInterface<2, false> con auto hess_dev = dh::CudaGetPointerDevice(hess.data); CHECK_EQ(grad_dev, hess_dev) << "gradient and hessian should be on the same device."; auto &gpair = *out_gpair; - gpair.SetDevice(grad_dev); + gpair.SetDevice(DeviceOrd::CUDA(grad_dev)); gpair.Reshape(grad.Shape(0), grad.Shape(1)); auto d_gpair = gpair.View(DeviceOrd::CUDA(grad_dev)); auto cuctx = ctx->CUDACtx(); @@ -153,7 +153,7 @@ int InplacePreidctCUDA(BoosterHandle handle, char const *c_array_interface, if (learner->Ctx()->IsCUDA()) { CHECK(p_predt->DeviceCanRead() && !p_predt->HostCanRead()); } - p_predt->SetDevice(proxy->DeviceIdx()); + p_predt->SetDevice(proxy->Device()); auto &shape = learner->GetThreadLocal().prediction_shape; size_t n_samples = p_m->Info().num_row_; diff --git a/src/c_api/c_api_utils.h b/src/c_api/c_api_utils.h index aee939466..5526619c0 100644 --- a/src/c_api/c_api_utils.h +++ b/src/c_api/c_api_utils.h @@ -1,5 +1,5 @@ /** - * Copyright 2021-2023 by XGBoost Contributors + * Copyright 2021-2023, XGBoost Contributors */ #ifndef XGBOOST_C_API_C_API_UTILS_H_ #define XGBOOST_C_API_C_API_UTILS_H_ @@ -13,6 +13,7 @@ #include // for move #include +#include "../common/json_utils.h" // for TypeCheck #include "xgboost/c_api.h" #include "xgboost/data.h" // DMatrix #include "xgboost/feature_map.h" // for FeatureMap @@ -254,28 +255,6 @@ inline void GenerateFeatureMap(Learner const *learner, void XGBBuildInfoDevice(Json* p_info); -template -auto const &RequiredArg(Json const &in, StringView key, StringView func) { - auto const &obj = get(in); - auto it = obj.find(key); - if (it == obj.cend() || IsA(it->second)) { - LOG(FATAL) << "Argument `" << key << "` is required for `" << func << "`."; - } - TypeCheck(it->second, StringView{key}); - return get const>(it->second); -} - -template -auto const &OptionalArg(Json const &in, StringView key, T const &dft) { - auto const &obj = get(in); - auto it = obj.find(key); - if (it != obj.cend() && !IsA(it->second)) { - TypeCheck(it->second, key); - return get const>(it->second); - } - return dft; -} - /** * \brief Get shared ptr from DMatrix C handle with additional checks. */ diff --git a/src/collective/aggregator.cuh b/src/collective/aggregator.cuh index a87a968ab..66766470b 100644 --- a/src/collective/aggregator.cuh +++ b/src/collective/aggregator.cuh @@ -15,8 +15,7 @@ #include "communicator-inl.cuh" -namespace xgboost { -namespace collective { +namespace xgboost::collective { /** * @brief Find the global sum of the given values across all workers. @@ -31,10 +30,9 @@ namespace collective { * @param size Number of values to sum. 
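 *
 * A minimal usage sketch (illustrative only; `info`, `ctx`, and `d_sums` are
 * assumed: a MetaInfo, a Context, and a pointer to `n` partial sums living in
 * device memory):
 *
 * @code
 *   collective::GlobalSum(info, ctx->Device(), d_sums, n);
 *   // d_sums now holds the global sums; the call is a no-op unless the data
 *   // is row-split across workers.
 * @endcode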
*/ template -void GlobalSum(MetaInfo const& info, int device, T* values, size_t size) { +void GlobalSum(MetaInfo const& info, DeviceOrd device, T* values, size_t size) { if (info.IsRowSplit()) { - collective::AllReduce(device, values, size); + collective::AllReduce(device.ordinal, values, size); } } -} // namespace collective -} // namespace xgboost +} // namespace xgboost::collective diff --git a/src/collective/allgather.cc b/src/collective/allgather.cc new file mode 100644 index 000000000..378a06911 --- /dev/null +++ b/src/collective/allgather.cc @@ -0,0 +1,88 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include "allgather.h" + +#include // for min, copy_n +#include // for size_t +#include // for int8_t, int32_t, int64_t +#include // for shared_ptr +#include // for partial_sum +#include // for vector + +#include "comm.h" // for Comm, Channel +#include "xgboost/collective/result.h" // for Result +#include "xgboost/span.h" // for Span + +namespace xgboost::collective::cpu_impl { +Result RingAllgather(Comm const& comm, common::Span data, std::size_t segment_size, + std::int32_t worker_off, std::shared_ptr prev_ch, + std::shared_ptr next_ch) { + auto world = comm.World(); + auto rank = comm.Rank(); + CHECK_LT(worker_off, world); + + for (std::int32_t r = 0; r < world; ++r) { + auto send_rank = (rank + world - r + worker_off) % world; + auto send_off = send_rank * segment_size; + send_off = std::min(send_off, data.size_bytes()); + auto send_seg = data.subspan(send_off, std::min(segment_size, data.size_bytes() - send_off)); + next_ch->SendAll(send_seg.data(), send_seg.size_bytes()); + + auto recv_rank = (rank + world - r - 1 + worker_off) % world; + auto recv_off = recv_rank * segment_size; + recv_off = std::min(recv_off, data.size_bytes()); + auto recv_seg = data.subspan(recv_off, std::min(segment_size, data.size_bytes() - recv_off)); + prev_ch->RecvAll(recv_seg.data(), recv_seg.size_bytes()); + auto rc = prev_ch->Block(); + if (!rc.OK()) { + return rc; + } + } + + return Success(); +} + +[[nodiscard]] Result RingAllgatherV(Comm const& comm, common::Span sizes, + common::Span data, + common::Span erased_result) { + auto world = comm.World(); + auto rank = comm.Rank(); + + auto prev = BootstrapPrev(rank, comm.World()); + auto next = BootstrapNext(rank, comm.World()); + + auto prev_ch = comm.Chan(prev); + auto next_ch = comm.Chan(next); + + // get worker offset + std::vector offset(world + 1, 0); + std::partial_sum(sizes.cbegin(), sizes.cend(), offset.begin() + 1); + CHECK_EQ(*offset.cbegin(), 0); + + // copy data + auto current = erased_result.subspan(offset[rank], data.size_bytes()); + auto erased_data = EraseType(data); + std::copy_n(erased_data.data(), erased_data.size(), current.data()); + + for (std::int32_t r = 0; r < world; ++r) { + auto send_rank = (rank + world - r) % world; + auto send_off = offset[send_rank]; + auto send_size = sizes[send_rank]; + auto send_seg = erased_result.subspan(send_off, send_size); + next_ch->SendAll(send_seg); + + auto recv_rank = (rank + world - r - 1) % world; + auto recv_off = offset[recv_rank]; + auto recv_size = sizes[recv_rank]; + auto recv_seg = erased_result.subspan(recv_off, recv_size); + prev_ch->RecvAll(recv_seg.data(), recv_seg.size_bytes()); + + auto rc = prev_ch->Block(); + if (!rc.OK()) { + return rc; + } + } + return comm.Block(); +} +} // namespace xgboost::collective::cpu_impl diff --git a/src/collective/allgather.h b/src/collective/allgather.h new file mode 100644 index 000000000..cb5f5b8af --- /dev/null +++ 
b/src/collective/allgather.h @@ -0,0 +1,72 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#pragma once +#include <cstddef> // for size_t +#include <cstdint> // for int32_t +#include <memory> // for shared_ptr +#include <numeric> // for accumulate +#include <type_traits> // for remove_cv_t +#include <vector> // for vector + +#include "comm.h" // for Comm, Channel, EraseType +#include "xgboost/collective/result.h" // for Result +#include "xgboost/span.h" // for Span + +namespace xgboost::collective { +namespace cpu_impl { +/** + * @param worker_off Segment offset. For example, if the rank 2 worker specifies worker_off + * = 1, then it owns the third segment. + */ +[[nodiscard]] Result RingAllgather(Comm const& comm, common::Span data, + std::size_t segment_size, std::int32_t worker_off, + std::shared_ptr<Channel> prev_ch, + std::shared_ptr<Channel> next_ch); + +[[nodiscard]] Result RingAllgatherV(Comm const& comm, common::Span sizes, + common::Span data, + common::Span erased_result); +} // namespace cpu_impl + +template <typename T> +[[nodiscard]] Result RingAllgather(Comm const& comm, common::Span data, std::size_t size) { + auto n_bytes = sizeof(T) * size; + auto erased = EraseType(data); + + auto rank = comm.Rank(); + auto prev = BootstrapPrev(rank, comm.World()); + auto next = BootstrapNext(rank, comm.World()); + + auto prev_ch = comm.Chan(prev); + auto next_ch = comm.Chan(next); + auto rc = cpu_impl::RingAllgather(comm, erased, n_bytes, 0, prev_ch, next_ch); + if (!rc.OK()) { + return rc; + } + return comm.Block(); +} + +template <typename T> +[[nodiscard]] Result RingAllgatherV(Comm const& comm, common::Span data, + std::vector<std::remove_cv_t<T>>* p_out) { + auto world = comm.World(); + auto rank = comm.Rank(); + + std::vector sizes(world, 0); + sizes[rank] = data.size_bytes(); + auto rc = RingAllgather(comm, common::Span{sizes.data(), sizes.size()}, 1); + if (!rc.OK()) { + return rc; + } + + std::vector<std::remove_cv_t<T>>& result = *p_out; + auto n_total_bytes = std::accumulate(sizes.cbegin(), sizes.cend(), 0); + result.resize(n_total_bytes / sizeof(T)); + auto h_result = common::Span{result.data(), result.size()}; + auto erased_result = EraseType(h_result); + auto erased_data = EraseType(data); + + return cpu_impl::RingAllgatherV(comm, sizes, erased_data, erased_result); +} +} // namespace xgboost::collective diff --git a/src/collective/allreduce.cc b/src/collective/allreduce.cc new file mode 100644 index 000000000..6948f6758 --- /dev/null +++ b/src/collective/allreduce.cc @@ -0,0 +1,90 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include "allreduce.h" + +#include <algorithm> // for min +#include <cstddef> // for size_t +#include <cstdint> // for int32_t, int8_t +#include <vector> // for vector + +#include "../data/array_interface.h" // for Type, DispatchDType +#include "allgather.h" // for RingAllgather +#include "comm.h" // for Comm +#include "xgboost/collective/result.h" // for Result +#include "xgboost/span.h" // for Span + +namespace xgboost::collective::cpu_impl { +template <typename T> +Result RingScatterReduceTyped(Comm const& comm, common::Span data, + std::size_t n_bytes_in_seg, Func const& op) { + auto rank = comm.Rank(); + auto world = comm.World(); + + auto dst_rank = BootstrapNext(rank, world); + auto src_rank = BootstrapPrev(rank, world); + auto next_ch = comm.Chan(dst_rank); + auto prev_ch = comm.Chan(src_rank); + + std::vector buffer(n_bytes_in_seg, 0); + auto s_buf = common::Span{buffer.data(), buffer.size()}; + + for (std::int32_t r = 0; r < world - 1; ++r) { + // send to ring next + auto send_off = ((rank + world - r) % world) * n_bytes_in_seg; + send_off = std::min(send_off, data.size_bytes()); + auto seg_nbytes =
std::min(data.size_bytes() - send_off, n_bytes_in_seg); + auto send_seg = data.subspan(send_off, seg_nbytes); + + next_ch->SendAll(send_seg); + + // receive from ring prev + auto recv_off = ((rank + world - r - 1) % world) * n_bytes_in_seg; + recv_off = std::min(recv_off, data.size_bytes()); + seg_nbytes = std::min(data.size_bytes() - recv_off, n_bytes_in_seg); + CHECK_EQ(seg_nbytes % sizeof(T), 0); + auto recv_seg = data.subspan(recv_off, seg_nbytes); + auto seg = s_buf.subspan(0, recv_seg.size()); + + prev_ch->RecvAll(seg); + auto rc = prev_ch->Block(); + if (!rc.OK()) { + return rc; + } + + // accumulate to recv_seg + CHECK_EQ(seg.size(), recv_seg.size()); + op(seg, recv_seg); + } + + return Success(); +} + +Result RingAllreduce(Comm const& comm, common::Span data, Func const& op, + ArrayInterfaceHandler::Type type) { + return DispatchDType(type, [&](auto t) { + using T = decltype(t); + // Divide the data into segments according to the number of workers. + auto n_bytes_elem = sizeof(T); + CHECK_EQ(data.size_bytes() % n_bytes_elem, 0); + auto n = data.size_bytes() / n_bytes_elem; + auto world = comm.World(); + auto n_bytes_in_seg = common::DivRoundUp(n, world) * sizeof(T); + auto rc = RingScatterReduceTyped(comm, data, n_bytes_in_seg, op); + if (!rc.OK()) { + return rc; + } + + auto prev = BootstrapPrev(comm.Rank(), comm.World()); + auto next = BootstrapNext(comm.Rank(), comm.World()); + auto prev_ch = comm.Chan(prev); + auto next_ch = comm.Chan(next); + + rc = RingAllgather(comm, data, n_bytes_in_seg, 1, prev_ch, next_ch); + if (!rc.OK()) { + return rc; + } + return comm.Block(); + }); +} +} // namespace xgboost::collective::cpu_impl diff --git a/src/collective/allreduce.h b/src/collective/allreduce.h new file mode 100644 index 000000000..e3f8ab5b8 --- /dev/null +++ b/src/collective/allreduce.h @@ -0,0 +1,39 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#pragma once +#include // for int8_t +#include // for function +#include // for is_invocable_v + +#include "../data/array_interface.h" // for ArrayInterfaceHandler +#include "comm.h" // for Comm, RestoreType +#include "xgboost/collective/result.h" // for Result +#include "xgboost/span.h" // for Span + +namespace xgboost::collective { +namespace cpu_impl { +using Func = + std::function lhs, common::Span out)>; + +Result RingAllreduce(Comm const& comm, common::Span data, Func const& op, + ArrayInterfaceHandler::Type type); +} // namespace cpu_impl + +template +std::enable_if_t, common::Span>, Result> Allreduce( + Comm const& comm, common::Span data, Fn redop) { + auto erased = EraseType(data); + auto type = ToDType::kType; + + auto erased_fn = [type, redop](common::Span lhs, + common::Span out) { + CHECK_EQ(lhs.size(), out.size()) << "Invalid input for reduction."; + auto lhs_t = RestoreType(lhs); + auto rhs_t = RestoreType(out); + redop(lhs_t, rhs_t); + }; + + return cpu_impl::RingAllreduce(comm, erased, erased_fn, type); +} +} // namespace xgboost::collective diff --git a/src/collective/broadcast.cc b/src/collective/broadcast.cc new file mode 100644 index 000000000..660bb9130 --- /dev/null +++ b/src/collective/broadcast.cc @@ -0,0 +1,84 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include "broadcast.h" + +#include // for ceil, log2 +#include // for int32_t, int8_t +#include // for move + +#include "../common/bitfield.h" // for TrailingZeroBits, RBitField32 +#include "comm.h" // for Comm +#include "xgboost/collective/result.h" // for Result +#include "xgboost/span.h" // for Span + +namespace 
xgboost::collective::cpu_impl { +namespace { +std::int32_t ShiftedParentRank(std::int32_t shifted_rank, std::int32_t depth) { + std::uint32_t mask{std::uint32_t{0} - 1}; // 0xff... + RBitField32 maskbits{common::Span{&mask, 1}}; + RBitField32 rankbits{ + common::Span{reinterpret_cast(&shifted_rank), 1}}; + // prepare for counting trailing zeros. + for (std::int32_t i = 0; i < depth + 1; ++i) { + if (rankbits.Check(i)) { + maskbits.Set(i); + } else { + maskbits.Clear(i); + } + } + + CHECK_NE(mask, 0); + auto k = TrailingZeroBits(mask); + auto shifted_parent = shifted_rank - (1 << k); + return shifted_parent; +} + +// Shift the root node to rank 0 +std::int32_t ShiftLeft(std::int32_t rank, std::int32_t world, std::int32_t root) { + auto shifted_rank = (rank + world - root) % world; + return shifted_rank; +} +// shift back to the original rank +std::int32_t ShiftRight(std::int32_t rank, std::int32_t world, std::int32_t root) { + auto orig = (rank + root) % world; + return orig; +} +} // namespace + +Result Broadcast(Comm const& comm, common::Span data, std::int32_t root) { + // Binomial tree broadcast + // * Wiki + // https://en.wikipedia.org/wiki/Broadcast_(parallel_pattern)#Binomial_Tree_Broadcast + // * Impl + // https://people.mpi-inf.mpg.de/~mehlhorn/ftp/NewToolbox/collective.pdf + + auto rank = comm.Rank(); + auto world = comm.World(); + + // shift root to rank 0 + auto shifted_rank = ShiftLeft(rank, world, root); + std::int32_t depth = std::ceil(std::log2(static_cast(world))) - 1; + + if (shifted_rank != 0) { // not root + auto parent = ShiftRight(ShiftedParentRank(shifted_rank, depth), world, root); + comm.Chan(parent)->RecvAll(data); + auto rc = comm.Chan(parent)->Block(); + if (!rc.OK()) { + return Fail("broadcast failed.", std::move(rc)); + } + } + + for (std::int32_t i = depth; i >= 0; --i) { + CHECK_GE((i + 1), 0); // silence a clang-tidy warning that i might be negative + if (shifted_rank % (1 << (i + 1)) == 0 && shifted_rank + (1 << i) < world) { + auto sft_peer = shifted_rank + (1 << i); + auto peer = ShiftRight(sft_peer, world, root); + CHECK_NE(peer, root); + comm.Chan(peer)->SendAll(data); + } + } + + return comm.Block(); +} +} // namespace xgboost::collective::cpu_impl diff --git a/src/collective/broadcast.h b/src/collective/broadcast.h new file mode 100644 index 000000000..28db83815 --- /dev/null +++ b/src/collective/broadcast.h @@ -0,0 +1,26 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#pragma once +#include <cstdint> // for int32_t, int8_t + +#include "comm.h" // for Comm +#include "xgboost/collective/result.h" // for Result +#include "xgboost/span.h" // for Span + +namespace xgboost::collective { +namespace cpu_impl { +Result Broadcast(Comm const& comm, common::Span data, std::int32_t root); +} + +/** + * @brief Binomial tree broadcast, used as the default implementation on CPU.
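+ *
+ * A worked example with world = 5 and root = 2: ranks are first shifted so
+ * the root maps to 0, and depth = ceil(log2(5)) - 1 = 2. In shifted terms,
+ * round i = 2 has rank 0 send to rank 4, round i = 1 has 0 send to 2, and
+ * round i = 0 has 0 send to 1 and 2 send to 3, so every rank holds the
+ * payload after O(log(world)) rounds.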
+ */ +template +[[nodiscard]] Result Broadcast(Comm const& comm, common::Span data, std::int32_t root) { + auto n_total_bytes = data.size_bytes(); + auto erased = + common::Span{reinterpret_cast(data.data()), n_total_bytes}; + return cpu_impl::Broadcast(comm, erased, root); +} +} // namespace xgboost::collective diff --git a/src/collective/comm.cc b/src/collective/comm.cc new file mode 100644 index 000000000..9ee1e0e6a --- /dev/null +++ b/src/collective/comm.cc @@ -0,0 +1,304 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include "comm.h" + +#include // for copy +#include // for seconds +#include // for shared_ptr +#include // for string +#include // for move, forward + +#include "allgather.h" +#include "protocol.h" // for kMagic +#include "xgboost/base.h" // for XGBOOST_STRICT_R_MODE +#include "xgboost/collective/socket.h" // for TCPSocket +#include "xgboost/json.h" // for Json, Object +#include "xgboost/string_view.h" // for StringView + +namespace xgboost::collective { +Comm::Comm(std::string const& host, std::int32_t port, std::chrono::seconds timeout, + std::int32_t retry, std::string task_id) + : timeout_{timeout}, + retry_{retry}, + tracker_{host, port, -1}, + task_id_{std::move(task_id)}, + loop_{std::make_shared(timeout)} {} + +Result ConnectTrackerImpl(proto::PeerInfo info, std::chrono::seconds timeout, std::int32_t retry, + std::string const& task_id, TCPSocket* out, std::int32_t rank, + std::int32_t world) { + // get information from tracker + CHECK(!info.host.empty()); + auto rc = Connect(info.host, info.port, retry, timeout, out); + if (!rc.OK()) { + return Fail("Failed to connect to the tracker.", std::move(rc)); + } + + TCPSocket& tracker = *out; + return std::move(rc) + << [&] { return tracker.NonBlocking(false); } + << [&] { return tracker.RecvTimeout(timeout); } + << [&] { return proto::Magic{}.Verify(&tracker); } + << [&] { return proto::Connect{}.WorkerSend(&tracker, world, rank, task_id); }; +} + +[[nodiscard]] Result Comm::ConnectTracker(TCPSocket* out) const { + return ConnectTrackerImpl(this->TrackerInfo(), this->Timeout(), this->retry_, this->task_id_, out, + this->Rank(), this->World()); +} + +[[nodiscard]] Result ConnectWorkers(Comm const& comm, TCPSocket* listener, std::int32_t lport, + proto::PeerInfo ninfo, std::chrono::seconds timeout, + std::int32_t retry, + std::vector>* out_workers) { + auto next = std::make_shared(); + auto prev = std::make_shared(); + + auto rc = Success() << [&] { + auto rc = Connect(ninfo.host, ninfo.port, retry, timeout, next.get()); + if (!rc.OK()) { + return Fail("Bootstrap failed to connect to ring next.", std::move(rc)); + } + return rc; + } << [&] { + return next->NonBlocking(true); + } << [&] { + SockAddrV4 addr; + return listener->Accept(prev.get(), &addr); + } << [&] { return prev->NonBlocking(true); }; + if (!rc.OK()) { + return rc; + } + + // exchange host name and port + std::vector buffer(HOST_NAME_MAX * comm.World(), 0); + auto s_buffer = common::Span{buffer.data(), buffer.size()}; + auto next_host = s_buffer.subspan(HOST_NAME_MAX * comm.Rank(), HOST_NAME_MAX); + if (next_host.size() < ninfo.host.size()) { + return Fail("Got an invalid host name."); + } + std::copy(ninfo.host.cbegin(), ninfo.host.cend(), next_host.begin()); + + auto prev_ch = std::make_shared(comm, prev); + auto next_ch = std::make_shared(comm, next); + + auto block = [&] { + for (auto ch : {prev_ch, next_ch}) { + auto rc = ch->Block(); + if (!rc.OK()) { + return rc; + } + } + return Success(); + }; + + rc = std::move(rc) << [&] { + return 
cpu_impl::RingAllgather(comm, s_buffer, HOST_NAME_MAX, 0, prev_ch, next_ch); + } << [&] { return block(); }; + if (!rc.OK()) { + return Fail("Failed to get host names from peers.", std::move(rc)); + } + + std::vector peers_port(comm.World(), -1); + peers_port[comm.Rank()] = ninfo.port; + rc = std::move(rc) << [&] { + auto s_ports = common::Span{reinterpret_cast(peers_port.data()), + peers_port.size() * sizeof(ninfo.port)}; + return cpu_impl::RingAllgather(comm, s_ports, sizeof(ninfo.port), 0, prev_ch, next_ch); + } << [&] { return block(); }; + if (!rc.OK()) { + return Fail("Failed to get the port from peers.", std::move(rc)); + } + + std::vector peers(comm.World()); + for (auto r = 0; r < comm.World(); ++r) { + auto nhost = s_buffer.subspan(HOST_NAME_MAX * r, HOST_NAME_MAX); + auto nport = peers_port[r]; + auto nrank = BootstrapNext(r, comm.World()); + + peers[nrank] = {std::string{reinterpret_cast(nhost.data())}, nport, nrank}; + } + CHECK_EQ(peers[comm.Rank()].port, lport); + for (auto const& p : peers) { + CHECK_NE(p.port, -1); + } + + std::vector>& workers = *out_workers; + workers.resize(comm.World()); + + for (std::int32_t r = (comm.Rank() + 1); r < comm.World(); ++r) { + auto const& peer = peers[r]; + std::shared_ptr worker{TCPSocket::CreatePtr(comm.Domain())}; + rc = std::move(rc) + << [&] { return Connect(peer.host, peer.port, retry, timeout, worker.get()); } + << [&] { return worker->RecvTimeout(timeout); }; + if (!rc.OK()) { + return rc; + } + + auto rank = comm.Rank(); + auto n_bytes = worker->SendAll(&rank, sizeof(comm.Rank())); + if (n_bytes != sizeof(comm.Rank())) { + return Fail("Failed to send rank."); + } + workers[r] = std::move(worker); + } + + for (std::int32_t r = 0; r < comm.Rank(); ++r) { + SockAddrV4 addr; + auto peer = std::shared_ptr(TCPSocket::CreatePtr(comm.Domain())); + rc = std::move(rc) << [&] { return listener->Accept(peer.get(), &addr); } + << [&] { return peer->RecvTimeout(timeout); }; + if (!rc.OK()) { + return rc; + } + std::int32_t rank{-1}; + auto n_bytes = peer->RecvAll(&rank, sizeof(rank)); + if (n_bytes != sizeof(comm.Rank())) { + return Fail("Failed to recv rank."); + } + workers[rank] = std::move(peer); + } + + for (std::int32_t r = 0; r < comm.World(); ++r) { + if (r == comm.Rank()) { + continue; + } + CHECK(workers[r]); + } + + return Success(); +} + +RabitComm::RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout, + std::int32_t retry, std::string task_id) + : Comm{std::move(host), port, timeout, retry, std::move(task_id)} { + auto rc = this->Bootstrap(timeout_, retry_, task_id_); + CHECK(rc.OK()) << rc.Report(); +} + +[[nodiscard]] Result RabitComm::Bootstrap(std::chrono::seconds timeout, std::int32_t retry, + std::string task_id) { + TCPSocket tracker; + std::int32_t world{-1}; + auto rc = ConnectTrackerImpl(this->TrackerInfo(), timeout, retry, task_id, &tracker, this->Rank(), + world); + if (!rc.OK()) { + return Fail("Bootstrap failed.", std::move(rc)); + } + + this->domain_ = tracker.Domain(); + + // Start command + TCPSocket listener = TCPSocket::Create(tracker.Domain()); + std::int32_t lport = listener.BindHost(); + listener.Listen(); + + // create worker for listening to error notice. 
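+  // The port of this listener is reported to the tracker below as
+  // `error_port`; when any peer signals an error, the tracker connects back,
+  // the blocking Accept() returns, and this worker shuts itself down instead
+  // of hanging on a half-dead ring.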
+ auto domain = tracker.Domain(); + std::shared_ptr error_sock{TCPSocket::CreatePtr(domain)}; + auto eport = error_sock->BindHost(); + error_sock->Listen(); + error_worker_ = std::thread{[this, error_sock = std::move(error_sock)] { + auto conn = error_sock->Accept(); + // On Windows, accept returns an invalid socket after the network is shut down. + if (conn.IsClosed()) { + return; + } + LOG(WARNING) << "Another worker has run into an error."; + std::string scmd; + conn.Recv(&scmd); + auto jcmd = Json::Load(scmd); + auto rc = this->Shutdown(); + if (!rc.OK()) { + LOG(WARNING) << "Failed to shut down worker: " << rc.Report(); + } +#if !defined(XGBOOST_STRICT_R_MODE) || XGBOOST_STRICT_R_MODE == 0 + exit(-1); +#else + LOG(FATAL) << rc.Report(); +#endif + }}; + error_worker_.detach(); + + proto::Start start; + rc = std::move(rc) << [&] { return start.WorkerSend(lport, &tracker, eport); } + << [&] { return start.WorkerRecv(&tracker, &world); }; + if (!rc.OK()) { + return rc; + } + this->world_ = world; + + // get ring neighbors + std::string snext; + tracker.Recv(&snext); + auto jnext = Json::Load(StringView{snext}); + + proto::PeerInfo ninfo{jnext}; + + // get the rank of this worker + this->rank_ = BootstrapPrev(ninfo.rank, world); + this->tracker_.rank = rank_; + + std::vector> workers; + rc = ConnectWorkers(*this, &listener, lport, ninfo, timeout, retry, &workers); + if (!rc.OK()) { + return rc; + } + + CHECK(this->channels_.empty()); + for (auto& w : workers) { + if (w) { + w->SetNoDelay(); + rc = w->NonBlocking(true); + } + if (!rc.OK()) { + return rc; + } + this->channels_.emplace_back(std::make_shared(*this, w)); + } + return rc; +} + +RabitComm::~RabitComm() noexcept(false) { + if (!IsDistributed()) { + return; + } + auto rc = this->Shutdown(); + if (!rc.OK()) { + LOG(WARNING) << rc.Report(); + } +} + +[[nodiscard]] Result RabitComm::Shutdown() { + TCPSocket tracker; + return Success() << [&] { + return ConnectTrackerImpl(tracker_, timeout_, retry_, task_id_, &tracker, Rank(), World()); + } << [&] { + return this->Block(); + } << [&] { + Json jcmd{Object{}}; + jcmd["cmd"] = Integer{static_cast(proto::CMD::kShutdown)}; + auto scmd = Json::Dump(jcmd); + auto n_bytes = tracker.Send(scmd); + if (n_bytes != scmd.size()) { + return Fail("Failed to send cmd."); + } + return Success(); + }; +} + +[[nodiscard]] Result RabitComm::LogTracker(std::string msg) const { + TCPSocket out; + proto::Print print; + return Success() << [&] { return this->ConnectTracker(&out); } + << [&] { return print.WorkerSend(&out, msg); }; +} + +[[nodiscard]] Result RabitComm::SignalError(Result const& res) { + TCPSocket out; + return Success() << [&] { return this->ConnectTracker(&out); } + << [&] { return proto::ErrorCMD{}.WorkerSend(&out, res); }; +} +} // namespace xgboost::collective diff --git a/src/collective/comm.h b/src/collective/comm.h new file mode 100644 index 000000000..b501fcddd --- /dev/null +++ b/src/collective/comm.h @@ -0,0 +1,156 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#pragma once +#include <chrono> // for seconds +#include <cstddef> // for size_t +#include <cstdint> // for int32_t +#include <memory> // for shared_ptr +#include <string> // for string +#include <thread> // for thread +#include <type_traits> // for remove_const_t +#include <utility> // for move +#include <vector> // for vector + +#include "loop.h" // for Loop +#include "protocol.h" // for PeerInfo +#include "xgboost/collective/result.h" // for Result +#include "xgboost/collective/socket.h" // for TCPSocket +#include "xgboost/span.h" // for Span + +namespace xgboost::collective { + +inline constexpr std::int32_t
DefaultTimeoutSec() { return 300; } // 5min +inline constexpr std::int32_t DefaultRetry() { return 3; } + +// indexing into the ring +inline std::int32_t BootstrapNext(std::int32_t r, std::int32_t world) { + auto nrank = (r + world + 1) % world; + return nrank; +} + +inline std::int32_t BootstrapPrev(std::int32_t r, std::int32_t world) { + auto nrank = (r + world - 1) % world; + return nrank; +} + +class Channel; + +/** + * @brief Base communicator storing info about the tracker and other communicators. + */ +class Comm { + protected: + std::int32_t world_{1}; + std::int32_t rank_{0}; + std::chrono::seconds timeout_{DefaultTimeoutSec()}; + std::int32_t retry_{DefaultRetry()}; + + proto::PeerInfo tracker_; + SockDomain domain_{SockDomain::kV4}; + std::thread error_worker_; + std::string task_id_; + std::vector> channels_; + std::shared_ptr loop_{new Loop{std::chrono::seconds{ + DefaultTimeoutSec()}}}; // fixme: require federated comm to have a timeout + + public: + Comm() = default; + Comm(std::string const& host, std::int32_t port, std::chrono::seconds timeout, std::int32_t retry, + std::string task_id); + virtual ~Comm() noexcept(false) {} // NOLINT + + Comm(Comm const& that) = delete; + Comm& operator=(Comm const& that) = delete; + Comm(Comm&& that) = delete; + Comm& operator=(Comm&& that) = delete; + + [[nodiscard]] auto TrackerInfo() const { return tracker_; } + [[nodiscard]] Result ConnectTracker(TCPSocket* out) const; + [[nodiscard]] auto Domain() const { return domain_; } + [[nodiscard]] auto Timeout() const { return timeout_; } + + [[nodiscard]] auto Rank() const { return rank_; } + [[nodiscard]] auto World() const { return world_; } + [[nodiscard]] bool IsDistributed() const { return World() > 1; } + void Submit(Loop::Op op) const { loop_->Submit(op); } + [[nodiscard]] Result Block() const { return loop_->Block(); } + + [[nodiscard]] virtual std::shared_ptr Chan(std::int32_t rank) const { + return channels_.at(rank); + } + [[nodiscard]] virtual bool IsFederated() const = 0; + [[nodiscard]] virtual Result LogTracker(std::string msg) const = 0; + + [[nodiscard]] virtual Result SignalError(Result const&) { return Success(); } +}; + +class RabitComm : public Comm { + [[nodiscard]] Result Bootstrap(std::chrono::seconds timeout, std::int32_t retry, + std::string task_id); + [[nodiscard]] Result Shutdown(); + + public: + // bootstrapping construction. + RabitComm() = default; + // ctor for testing where environment is known. + RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout, + std::int32_t retry, std::string task_id); + ~RabitComm() noexcept(false) override; + + [[nodiscard]] bool IsFederated() const override { return false; } + [[nodiscard]] Result LogTracker(std::string msg) const override; + + [[nodiscard]] Result SignalError(Result const&) override; +}; + +/** + * @brief Communication channel between workers. 
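+ *
+ * Send/recv calls only enqueue work on the shared event loop; Block() drives
+ * the loop until the queue is drained. A minimal sketch (assuming a connected
+ * Comm `comm`, a peer rank `r`, and a byte span `buf`):
+ *
+ * @code
+ *   auto ch = comm.Chan(r);
+ *   ch->SendAll(buf);       // non-blocking, submits the op
+ *   auto rc = ch->Block();  // waits for completion, returns the Result
+ * @endcode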
+ */ +class Channel { + std::shared_ptr sock_{nullptr}; + Result rc_; + Comm const& comm_; + + public: + explicit Channel(Comm const& comm, std::shared_ptr sock) + : sock_{std::move(sock)}, comm_{comm} {} + + void SendAll(std::int8_t const* ptr, std::size_t n) { + Loop::Op op{Loop::Op::kWrite, comm_.Rank(), const_cast(ptr), n, sock_.get(), 0}; + CHECK(sock_.get()); + comm_.Submit(std::move(op)); + } + void SendAll(common::Span data) { + this->SendAll(data.data(), data.size_bytes()); + } + + void RecvAll(std::int8_t* ptr, std::size_t n) { + Loop::Op op{Loop::Op::kRead, comm_.Rank(), ptr, n, sock_.get(), 0}; + CHECK(sock_.get()); + comm_.Submit(std::move(op)); + } + void RecvAll(common::Span data) { this->RecvAll(data.data(), data.size_bytes()); } + + [[nodiscard]] auto Socket() const { return sock_; } + [[nodiscard]] Result Block() { return comm_.Block(); } +}; + +enum class Op { kMax = 0, kMin = 1, kSum = 2, kBitwiseAND = 3, kBitwiseOR = 4, kBitwiseXOR = 5 }; + +template , + std::add_const_t, std::int8_t>> +common::Span EraseType(common::Span data) { + auto n_total_bytes = data.size_bytes(); + auto erased = common::Span{reinterpret_cast>(data.data()), n_total_bytes}; + return erased; +} + +template +common::Span RestoreType(common::Span data) { + static_assert(std::is_same_v, std::int8_t>); + auto n_total_bytes = data.size_bytes(); + auto restored = common::Span{reinterpret_cast(data.data()), n_total_bytes / sizeof(T)}; + return restored; +} +} // namespace xgboost::collective diff --git a/src/collective/communicator-inl.h b/src/collective/communicator-inl.h index 59cc4cc45..c58a9f3bc 100644 --- a/src/collective/communicator-inl.h +++ b/src/collective/communicator-inl.h @@ -57,9 +57,7 @@ namespace collective { * - federated_client_key: Client key file path. Only needed for the SSL mode. * - federated_client_cert: Client certificate file path. Only needed for the SSL mode. */ -inline void Init(Json const& config) { - Communicator::Init(config); -} +inline void Init(Json const &config) { Communicator::Init(config); } /*! * \brief Finalize the collective communicator. @@ -141,17 +139,89 @@ inline void Broadcast(std::string *sendrecv_data, int root) { } } +/** + * @brief Gathers a single value from all processes and distributes the result to all processes. + * + * @param input The single value. + */ +template +inline std::vector Allgather(T const &input) { + std::string_view str_input{reinterpret_cast(&input), sizeof(T)}; + auto const output = Communicator::Get()->AllGather(str_input); + CHECK_EQ(output.size() % sizeof(T), 0); + std::vector result(output.size() / sizeof(T)); + std::memcpy(reinterpret_cast(result.data()), output.data(), output.size()); + return result; +} + +/** + * @brief Gathers data from all processes and distributes it to all processes. + * - * This assumes all ranks have the same size, and input data has been sliced into the - * corresponding position. + * This assumes all ranks have the same size. * - * @param send_receive_buffer Buffer storing the data. - * @param size Size of the data in bytes. + * @param input Buffer storing the data.
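+ *
+ * For example, with two workers, if rank 0 passes {1, 2} and rank 1 passes
+ * {3, 4}, every worker receives {1, 2, 3, 4}, ordered by rank.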
*/ -inline void Allgather(void *send_receive_buffer, std::size_t size) { - Communicator::Get()->AllGather(send_receive_buffer, size); +template +inline std::vector Allgather(std::vector const &input) { + if (input.empty()) { + return input; + } + std::string_view str_input{reinterpret_cast(input.data()), + input.size() * sizeof(T)}; + auto const output = Communicator::Get()->AllGather(str_input); + CHECK_EQ(output.size() % sizeof(T), 0); + std::vector result(output.size() / sizeof(T)); + std::memcpy(reinterpret_cast(result.data()), output.data(), output.size()); + return result; +} + +/** + * @brief Gathers variable-length data from all processes and distributes it to all processes. + * @param input Buffer storing the data. + */ +template +inline std::vector AllgatherV(std::vector const &input) { + std::string_view str_input{reinterpret_cast(input.data()), + input.size() * sizeof(T)}; + auto const output = Communicator::Get()->AllGatherV(str_input); + CHECK_EQ(output.size() % sizeof(T), 0); + std::vector result(output.size() / sizeof(T)); + if (!output.empty()) { + std::memcpy(reinterpret_cast(result.data()), output.data(), output.size()); + } + return result; +} + +/** + * @brief Gathers variable-length strings from all processes and distributes them to all processes. + * @param input Variable-length list of variable-length strings. + */ +inline std::vector AllgatherStrings(std::vector const &input) { + std::size_t total_size{0}; + for (auto const &s : input) { + total_size += s.length() + 1; // +1 for null-terminators + } + std::string flat_string; + flat_string.reserve(total_size); + for (auto const &s : input) { + flat_string.append(s); + flat_string.push_back('\0'); // Append a null-terminator after each string + } + + auto const output = Communicator::Get()->AllGatherV(flat_string); + + std::vector result; + std::size_t start_index = 0; + // Iterate through the output, find each null-terminated substring. + for (std::size_t i = 0; i < output.size(); i++) { + if (output[i] == '\0') { + // Construct a std::string from the char* substring + result.emplace_back(&output[start_index]); + // Move to the next substring + start_index = i + 1; + } + } + return result; } /*! @@ -226,7 +296,7 @@ inline void Allreduce(double *send_receive_buffer, size_t count) { } template -struct AllgatherVResult { +struct SpecialAllgatherVResult { std::vector offsets; std::vector sizes; std::vector result; @@ -241,14 +311,10 @@ struct AllgatherVResult { * @param sizes Sizes of each input. */ template -inline AllgatherVResult AllgatherV(std::vector const &inputs, - std::vector const &sizes) { - auto num_inputs = sizes.size(); - +inline SpecialAllgatherVResult SpecialAllgatherV(std::vector const &inputs, + std::vector const &sizes) { // Gather the sizes across all workers. - std::vector all_sizes(num_inputs * GetWorldSize()); - std::copy_n(sizes.cbegin(), sizes.size(), all_sizes.begin() + num_inputs * GetRank()); - collective::Allgather(all_sizes.data(), all_sizes.size() * sizeof(std::size_t)); + auto const all_sizes = Allgather(sizes); // Calculate input offsets (std::exclusive_scan). std::vector offsets(all_sizes.size()); @@ -257,11 +323,7 @@ inline AllgatherVResult AllgatherV(std::vector const &inputs, } // Gather all the inputs. 
- auto total_input_size = offsets.back() + all_sizes.back(); - std::vector all_inputs(total_input_size); - std::copy_n(inputs.cbegin(), inputs.size(), all_inputs.begin() + offsets[num_inputs * GetRank()]); - // We cannot use allgather here, since each worker might have a different size. - Allreduce(all_inputs.data(), all_inputs.size()); + auto const all_inputs = AllgatherV(inputs); return {offsets, all_sizes, all_inputs}; } diff --git a/src/collective/communicator.cc b/src/collective/communicator.cc index bb69fe1d5..5d228c2fd 100644 --- a/src/collective/communicator.cc +++ b/src/collective/communicator.cc @@ -11,9 +11,7 @@ #include "../../plugin/federated/federated_communicator.h" #endif -namespace xgboost { -namespace collective { - +namespace xgboost::collective { thread_local std::unique_ptr Communicator::communicator_{new NoOpCommunicator()}; thread_local CommunicatorType Communicator::type_{}; @@ -57,6 +55,4 @@ void Communicator::Finalize() { communicator_.reset(new NoOpCommunicator()); } #endif - -} // namespace collective -} // namespace xgboost +} // namespace xgboost::collective diff --git a/src/collective/communicator.h b/src/collective/communicator.h index 510bd476c..04fd6cc08 100644 --- a/src/collective/communicator.h +++ b/src/collective/communicator.h @@ -125,13 +125,17 @@ class Communicator { /** * @brief Gathers data from all processes and distributes it to all processes. * - * This assumes all ranks have the same size, and input data has been sliced into the - * corresponding position. + * This assumes all ranks have the same size. * - * @param send_receive_buffer Buffer storing the data. - * @param size Size of the data in bytes. + * @param input Buffer storing the data. */ - virtual void AllGather(void *send_receive_buffer, std::size_t size) = 0; + virtual std::string AllGather(std::string_view input) = 0; + + /** + * @brief Gathers variable-length data from all processes and distributes it to all processes. + * @param input Buffer storing the data. + */ + virtual std::string AllGatherV(std::string_view input) = 0; /** * @brief Combines values from all processes and distributes the result back to all processes. 
diff --git a/src/collective/device_communicator_adapter.cuh b/src/collective/device_communicator_adapter.cuh index 0ffa28770..d149348a6 100644 --- a/src/collective/device_communicator_adapter.cuh +++ b/src/collective/device_communicator_adapter.cuh @@ -40,12 +40,10 @@ class DeviceCommunicatorAdapter : public DeviceCommunicator { } dh::safe_cuda(cudaSetDevice(device_ordinal_)); - host_buffer_.resize(send_size * world_size_); - dh::safe_cuda(cudaMemcpy(host_buffer_.data() + rank_ * send_size, send_buffer, send_size, - cudaMemcpyDefault)); - Allgather(host_buffer_.data(), host_buffer_.size()); - dh::safe_cuda( - cudaMemcpy(receive_buffer, host_buffer_.data(), host_buffer_.size(), cudaMemcpyDefault)); + host_buffer_.resize(send_size); + dh::safe_cuda(cudaMemcpy(host_buffer_.data(), send_buffer, send_size, cudaMemcpyDefault)); + auto const output = Allgather(host_buffer_); + dh::safe_cuda(cudaMemcpy(receive_buffer, output.data(), output.size(), cudaMemcpyDefault)); } void AllGatherV(void const *send_buffer, size_t length_bytes, std::vector *segments, diff --git a/src/collective/in_memory_communicator.h b/src/collective/in_memory_communicator.h index f41029af1..c712d32a8 100644 --- a/src/collective/in_memory_communicator.h +++ b/src/collective/in_memory_communicator.h @@ -60,11 +60,16 @@ class InMemoryCommunicator : public Communicator { bool IsDistributed() const override { return true; } bool IsFederated() const override { return false; } - void AllGather(void* in_out, std::size_t size) override { + std::string AllGather(std::string_view input) override { std::string output; - handler_.Allgather(static_cast(in_out), size, &output, sequence_number_++, - GetRank()); - output.copy(static_cast(in_out), size); + handler_.Allgather(input.data(), input.size(), &output, sequence_number_++, GetRank()); + return output; + } + + std::string AllGatherV(std::string_view input) override { + std::string output; + handler_.AllgatherV(input.data(), input.size(), &output, sequence_number_++, GetRank()); + return output; } void AllReduce(void* in_out, std::size_t size, DataType data_type, Operation operation) override { diff --git a/src/collective/in_memory_handler.cc b/src/collective/in_memory_handler.cc index a45fe3e7d..944e5077b 100644 --- a/src/collective/in_memory_handler.cc +++ b/src/collective/in_memory_handler.cc @@ -16,23 +16,49 @@ class AllgatherFunctor { public: std::string const name{"Allgather"}; - AllgatherFunctor(int world_size, int rank) : world_size_{world_size}, rank_{rank} {} + AllgatherFunctor(std::size_t world_size, std::size_t rank) + : world_size_{world_size}, rank_{rank} {} void operator()(char const* input, std::size_t bytes, std::string* buffer) const { if (buffer->empty()) { - // Copy the input if this is the first request. - buffer->assign(input, bytes); - } else { - // Splice the input into the common buffer. - auto const per_rank = bytes / world_size_; - auto const index = rank_ * per_rank; - buffer->replace(index, per_rank, input + index, per_rank); + // Resize the buffer if this is the first request. + buffer->resize(bytes * world_size_); + } + + // Splice the input into the common buffer. + buffer->replace(rank_ * bytes, bytes, input, bytes); + } + + private: + std::size_t world_size_; + std::size_t rank_; +}; + +/** + * @brief Functor for variable-length allgather. 
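+ *
+ * Each rank's contribution is buffered in a map keyed by rank; once all
+ * world_size_ inputs have arrived they are concatenated in rank order and the
+ * map is cleared. E.g. with two workers, rank 0 contributing "a" and rank 1
+ * contributing "bb", every caller's output buffer ends up holding "abb".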
+ */ +class AllgatherVFunctor { + public: + std::string const name{"AllgatherV"}; + + AllgatherVFunctor(std::size_t world_size, std::size_t rank, + std::map* data) + : world_size_{world_size}, rank_{rank}, data_{data} {} + + void operator()(char const* input, std::size_t bytes, std::string* buffer) const { + data_->emplace(rank_, std::string_view{input, bytes}); + if (data_->size() == world_size_) { + for (auto const& kv : *data_) { + buffer->append(kv.second); + } + data_->clear(); } } private: - int world_size_; - int rank_; + std::size_t world_size_; + std::size_t rank_; + std::map* data_; }; /** @@ -154,7 +180,7 @@ class BroadcastFunctor { public: std::string const name{"Broadcast"}; - BroadcastFunctor(int rank, int root) : rank_{rank}, root_{root} {} + BroadcastFunctor(std::size_t rank, std::size_t root) : rank_{rank}, root_{root} {} void operator()(char const* input, std::size_t bytes, std::string* buffer) const { if (rank_ == root_) { @@ -164,11 +190,11 @@ class BroadcastFunctor { } private: - int rank_; - int root_; + std::size_t rank_; + std::size_t root_; }; -void InMemoryHandler::Init(int world_size, int) { +void InMemoryHandler::Init(std::size_t world_size, std::size_t) { CHECK(world_size_ < world_size) << "In memory handler already initialized."; std::unique_lock lock(mutex_); @@ -178,7 +204,7 @@ void InMemoryHandler::Init(int world_size, int) { cv_.notify_all(); } -void InMemoryHandler::Shutdown(uint64_t sequence_number, int) { +void InMemoryHandler::Shutdown(uint64_t sequence_number, std::size_t) { CHECK(world_size_ > 0) << "In memory handler already shutdown."; std::unique_lock lock(mutex_); @@ -194,24 +220,30 @@ void InMemoryHandler::Shutdown(uint64_t sequence_number, int) { } void InMemoryHandler::Allgather(char const* input, std::size_t bytes, std::string* output, - std::size_t sequence_number, int rank) { + std::size_t sequence_number, std::size_t rank) { Handle(input, bytes, output, sequence_number, rank, AllgatherFunctor{world_size_, rank}); } +void InMemoryHandler::AllgatherV(char const* input, std::size_t bytes, std::string* output, + std::size_t sequence_number, std::size_t rank) { + Handle(input, bytes, output, sequence_number, rank, AllgatherVFunctor{world_size_, rank, &aux_}); +} + void InMemoryHandler::Allreduce(char const* input, std::size_t bytes, std::string* output, - std::size_t sequence_number, int rank, DataType data_type, + std::size_t sequence_number, std::size_t rank, DataType data_type, Operation op) { Handle(input, bytes, output, sequence_number, rank, AllreduceFunctor{data_type, op}); } void InMemoryHandler::Broadcast(char const* input, std::size_t bytes, std::string* output, - std::size_t sequence_number, int rank, int root) { + std::size_t sequence_number, std::size_t rank, std::size_t root) { Handle(input, bytes, output, sequence_number, rank, BroadcastFunctor{rank, root}); } template void InMemoryHandler::Handle(char const* input, std::size_t bytes, std::string* output, - std::size_t sequence_number, int rank, HandlerFunctor const& functor) { + std::size_t sequence_number, std::size_t rank, + HandlerFunctor const& functor) { // Pass through if there is only 1 client. 
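  // (With a single worker there is no peer to synchronize with, so the input
  // is passed straight through to the output and the call completes
  // immediately.)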
if (world_size_ == 1) { if (input != output->data()) { diff --git a/src/collective/in_memory_handler.h b/src/collective/in_memory_handler.h index 4182c7b3d..f9ac52007 100644 --- a/src/collective/in_memory_handler.h +++ b/src/collective/in_memory_handler.h @@ -3,6 +3,7 @@ */ #pragma once #include +#include #include #include "communicator.h" @@ -31,7 +32,7 @@ class InMemoryHandler { * * This is used when the handler only needs to be initialized once with a known world size. */ - explicit InMemoryHandler(int worldSize) : world_size_{worldSize} {} + explicit InMemoryHandler(std::size_t worldSize) : world_size_{worldSize} {} /** * @brief Initialize the handler with the world size and rank. @@ -41,7 +42,7 @@ class InMemoryHandler { * This is used when multiple objects/threads are accessing the same handler and need to * initialize it collectively. */ - void Init(int world_size, int rank); + void Init(std::size_t world_size, std::size_t rank); /** * @brief Shut down the handler. @@ -51,7 +52,7 @@ class InMemoryHandler { * This is used when multiple objects/threads are accessing the same handler and need to * shut it down collectively. */ - void Shutdown(uint64_t sequence_number, int rank); + void Shutdown(uint64_t sequence_number, std::size_t rank); /** * @brief Perform allgather. @@ -62,7 +63,18 @@ class InMemoryHandler { * @param rank Index of the worker. */ void Allgather(char const* input, std::size_t bytes, std::string* output, - std::size_t sequence_number, int rank); + std::size_t sequence_number, std::size_t rank); + + /** + * @brief Perform variable-length allgather. + * @param input The input buffer. + * @param bytes Number of bytes in the input buffer. + * @param output The output buffer. + * @param sequence_number Call sequence number. + * @param rank Index of the worker. + */ + void AllgatherV(char const* input, std::size_t bytes, std::string* output, + std::size_t sequence_number, std::size_t rank); /** * @brief Perform allreduce. @@ -75,7 +87,7 @@ class InMemoryHandler { * @param op The reduce operation. */ void Allreduce(char const* input, std::size_t bytes, std::string* output, - std::size_t sequence_number, int rank, DataType data_type, Operation op); + std::size_t sequence_number, std::size_t rank, DataType data_type, Operation op); /** * @brief Perform broadcast. @@ -87,7 +99,7 @@ class InMemoryHandler { * @param root Index of the worker to broadcast from. */ void Broadcast(char const* input, std::size_t bytes, std::string* output, - std::size_t sequence_number, int rank, int root); + std::size_t sequence_number, std::size_t rank, std::size_t root); private: /** @@ -102,15 +114,16 @@ class InMemoryHandler { */ template void Handle(char const* input, std::size_t size, std::string* output, std::size_t sequence_number, - int rank, HandlerFunctor const& functor); + std::size_t rank, HandlerFunctor const& functor); - int world_size_{}; /// Number of workers. - int received_{}; /// Number of calls received with the current sequence. - int sent_{}; /// Number of calls completed with the current sequence. - std::string buffer_{}; /// A shared common buffer. - uint64_t sequence_number_{}; /// Call sequence number. - mutable std::mutex mutex_; /// Lock. - mutable std::condition_variable cv_; /// Conditional variable to wait on. + std::size_t world_size_{}; /// Number of workers. + std::size_t received_{}; /// Number of calls received with the current sequence. + std::size_t sent_{}; /// Number of calls completed with the current sequence. 
+ std::string buffer_{}; /// A shared common buffer. + std::map aux_{}; /// A shared auxiliary map. + uint64_t sequence_number_{}; /// Call sequence number. + mutable std::mutex mutex_; /// Lock. + mutable std::condition_variable cv_; /// Condition variable to wait on. }; } // namespace collective diff --git a/src/collective/loop.cc b/src/collective/loop.cc new file mode 100644 index 000000000..95a1019ac --- /dev/null +++ b/src/collective/loop.cc @@ -0,0 +1,167 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include "loop.h" + +#include <queue> // for queue + +#include "rabit/internal/socket.h" // for PollHelper +#include "xgboost/collective/socket.h" // for FailWithCode +#include "xgboost/logging.h" // for CHECK + +namespace xgboost::collective { +Result Loop::EmptyQueue() { + timer_.Start(__func__); + auto error = [this] { + this->stop_ = true; + timer_.Stop(__func__); + }; + + while (!queue_.empty() && !stop_) { + std::queue qcopy; + rabit::utils::PollHelper poll; + + // watch all ops + while (!queue_.empty()) { + auto op = queue_.front(); + queue_.pop(); + + switch (op.code) { + case Op::kRead: { + poll.WatchRead(*op.sock); + break; + } + case Op::kWrite: { + poll.WatchWrite(*op.sock); + break; + } + default: { + error(); + return Fail("Invalid socket operation."); + } + } + qcopy.push(op); + } + + // poll, work on fds that are ready. + timer_.Start("poll"); + auto rc = poll.Poll(timeout_); + timer_.Stop("poll"); + if (!rc.OK()) { + error(); + return rc; + } + // we wouldn't be here if the queue were empty. + CHECK(!qcopy.empty()); + + while (!qcopy.empty() && !stop_) { + auto op = qcopy.front(); + qcopy.pop(); + + std::int32_t n_bytes_done{0}; + CHECK(op.sock->NonBlocking()); + + switch (op.code) { + case Op::kRead: { + if (poll.CheckRead(*op.sock)) { + n_bytes_done = op.sock->Recv(op.ptr + op.off, op.n - op.off); + } + break; + } + case Op::kWrite: { + if (poll.CheckWrite(*op.sock)) { + n_bytes_done = op.sock->Send(op.ptr + op.off, op.n - op.off); + } + break; + } + default: { + error(); + return Fail("Invalid socket operation."); + } + } + + if (n_bytes_done == -1 && !system::LastErrorWouldBlock()) { + stop_ = true; + auto rc = system::FailWithCode("Invalid socket output."); + error(); + return rc; + } + op.off += n_bytes_done; + CHECK_LE(op.off, op.n); + + if (op.off != op.n) { + // not yet finished, push back to queue for next round. + queue_.push(op); + } + } + } + timer_.Stop(__func__); + return Success(); +} + +void Loop::Process() { + // consumer + while (true) { + std::unique_lock lock{mu_}; + cv_.wait(lock, [this] { return !this->queue_.empty() || stop_; }); + if (stop_) { + break; + } + CHECK(!mu_.try_lock()); + + this->rc_ = this->EmptyQueue(); + if (!rc_.OK()) { + stop_ = true; + cv_.notify_one(); + break; + } + + CHECK(queue_.empty()); + CHECK(!mu_.try_lock()); + cv_.notify_one(); + } + + if (rc_.OK()) { + CHECK(queue_.empty()); + } +} + +Result Loop::Stop() { + std::unique_lock lock{mu_}; + stop_ = true; + lock.unlock(); + + CHECK_EQ(this->Block().OK(), this->rc_.OK()); + + if (curr_exce_) { + std::rethrow_exception(curr_exce_); + } + + return Success(); +} + +Loop::Loop(std::chrono::seconds timeout) : timeout_{timeout} { + timer_.Init(__func__); + worker_ = std::thread{[this] { + try { + this->Process(); + } catch (std::exception const& e) { + std::lock_guard guard{mu_}; + if (!curr_exce_) { + curr_exce_ = std::current_exception(); + rc_ = Fail("Exception was thrown"); + } + stop_ = true; + cv_.notify_all(); + } catch (...)
{ + std::lock_guard guard{mu_}; + if (!curr_exce_) { + curr_exce_ = std::current_exception(); + rc_ = Fail("Exception was thrown"); + } + stop_ = true; + cv_.notify_all(); + } + }}; +} +} // namespace xgboost::collective diff --git a/src/collective/loop.h b/src/collective/loop.h new file mode 100644 index 000000000..0bccbc0d0 --- /dev/null +++ b/src/collective/loop.h @@ -0,0 +1,83 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#pragma once +#include <chrono> // for seconds +#include <condition_variable> // for condition_variable +#include <cstddef> // for size_t +#include <cstdint> // for int8_t, int32_t +#include <exception> // for exception_ptr +#include <mutex> // for unique_lock, mutex +#include <queue> // for queue +#include <thread> // for thread +#include <utility> // for move + +#include "../common/timer.h" // for Monitor +#include "xgboost/collective/result.h" // for Result +#include "xgboost/collective/socket.h" // for TCPSocket + +namespace xgboost::collective { +class Loop { + public: + struct Op { + enum Code : std::int8_t { kRead = 0, kWrite = 1 } code; + std::int32_t rank{-1}; + std::int8_t* ptr{nullptr}; + std::size_t n{0}; + TCPSocket* sock{nullptr}; + std::size_t off{0}; + + Op(Code c, std::int32_t rank, std::int8_t* ptr, std::size_t n, TCPSocket* sock, std::size_t off) + : code{c}, rank{rank}, ptr{ptr}, n{n}, sock{sock}, off{off} {} + Op(Op const&) = default; + Op& operator=(Op const&) = default; + Op(Op&&) = default; + Op& operator=(Op&&) = default; + }; + + private: + std::thread worker_; + std::condition_variable cv_; + std::mutex mu_; + std::queue<Op> queue_; + std::chrono::seconds timeout_; + Result rc_; + bool stop_{false}; + std::exception_ptr curr_exce_{nullptr}; + common::Monitor timer_; + + Result EmptyQueue(); + void Process(); + + public: + Result Stop(); + + void Submit(Op op) { + // producer + std::unique_lock lock{mu_}; + queue_.push(op); + lock.unlock(); + cv_.notify_one(); + } + + [[nodiscard]] Result Block() { + { + std::unique_lock lock{mu_}; + cv_.notify_all(); + } + std::unique_lock lock{mu_}; + cv_.wait(lock, [this] { return this->queue_.empty() || stop_; }); + return std::move(rc_); + } + + explicit Loop(std::chrono::seconds timeout); + + ~Loop() noexcept(false) { + this->Stop(); + + if (worker_.joinable()) { + worker_.join(); + } + } +}; +} // namespace xgboost::collective diff --git a/src/collective/noop_communicator.h b/src/collective/noop_communicator.h index 28a0a1cad..2d88fd802 100644 --- a/src/collective/noop_communicator.h +++ b/src/collective/noop_communicator.h @@ -17,10 +17,11 @@ class NoOpCommunicator : public Communicator { NoOpCommunicator() : Communicator(1, 0) {} bool IsDistributed() const override { return false; } bool IsFederated() const override { return false; } - void AllGather(void *, std::size_t) override {} + std::string AllGather(std::string_view) override { return {}; } + std::string AllGatherV(std::string_view) override { return {}; } void AllReduce(void *, std::size_t, DataType, Operation) override {} void Broadcast(void *, std::size_t, int) override {} - std::string GetProcessorName() override { return ""; } + std::string GetProcessorName() override { return {}; } void Print(const std::string &message) override { LOG(CONSOLE) << message; } protected: diff --git a/src/collective/protocol.h b/src/collective/protocol.h new file mode 100644 index 000000000..96edf4e29 --- /dev/null +++ b/src/collective/protocol.h @@ -0,0 +1,214 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#pragma once +#include <cstdint> // for int32_t +#include <string> // for string +#include <utility> // for move + +#include "xgboost/collective/result.h" // for
Result +#include "xgboost/collective/socket.h" // for TCPSocket +#include "xgboost/json.h" // for Json + +namespace xgboost::collective::proto { +struct PeerInfo { + std::string host; + std::int32_t port{-1}; + std::int32_t rank{-1}; + + PeerInfo() = default; + PeerInfo(std::string host, std::int32_t port, std::int32_t rank) + : host{std::move(host)}, port{port}, rank{rank} {} + + explicit PeerInfo(Json const& peer) + : host{get(peer["host"])}, + port{static_cast(get(peer["port"]))}, + rank{static_cast(get(peer["rank"]))} {} + + [[nodiscard]] Json ToJson() const { + Json info{Object{}}; + info["rank"] = rank; + info["host"] = String{host}; + info["port"] = Integer{port}; + return info; + } + + [[nodiscard]] auto HostPort() const { return host + ":" + std::to_string(this->port); } +}; + +struct Magic { + static constexpr std::int32_t kMagic = 0xff99; + + [[nodiscard]] Result Verify(xgboost::collective::TCPSocket* p_sock) { + std::int32_t magic{kMagic}; + auto n_bytes = p_sock->SendAll(&magic, sizeof(magic)); + if (n_bytes != sizeof(magic)) { + return Fail("Failed to verify."); + } + + magic = 0; + n_bytes = p_sock->RecvAll(&magic, sizeof(magic)); + if (n_bytes != sizeof(magic)) { + return Fail("Failed to verify."); + } + if (magic != kMagic) { + return xgboost::collective::Fail("Invalid verification number."); + } + return Success(); + } +}; + +enum class CMD : std::int32_t { + kInvalid = 0, + kStart = 1, + kShutdown = 2, + kError = 3, + kPrint = 4, +}; + +struct Connect { + [[nodiscard]] Result WorkerSend(TCPSocket* tracker, std::int32_t world, std::int32_t rank, + std::string task_id) const { + Json jinit{Object{}}; + jinit["world_size"] = Integer{world}; + jinit["rank"] = Integer{rank}; + jinit["task_id"] = String{task_id}; + std::string msg; + Json::Dump(jinit, &msg); + auto n_bytes = tracker->Send(msg); + if (n_bytes != msg.size()) { + return Fail("Failed to send init command from worker."); + } + return Success(); + } + [[nodiscard]] Result TrackerRecv(TCPSocket* sock, std::int32_t* world, std::int32_t* rank, + std::string* task_id) const { + std::string init; + sock->Recv(&init); + auto jinit = Json::Load(StringView{init}); + *world = get(jinit["world_size"]); + *rank = get(jinit["rank"]); + *task_id = get(jinit["task_id"]); + return Success(); + } +}; + +class Start { + private: + [[nodiscard]] Result TrackerSend(std::int32_t world, TCPSocket* worker) const { + Json jcmd{Object{}}; + jcmd["world_size"] = Integer{world}; + auto scmd = Json::Dump(jcmd); + auto n_bytes = worker->Send(scmd); + if (n_bytes != scmd.size()) { + return Fail("Failed to send init command from tracker."); + } + return Success(); + } + + public: + [[nodiscard]] Result WorkerSend(std::int32_t lport, TCPSocket* tracker, + std::int32_t eport) const { + Json jcmd{Object{}}; + jcmd["cmd"] = Integer{static_cast(CMD::kStart)}; + jcmd["port"] = Integer{lport}; + jcmd["error_port"] = Integer{eport}; + auto scmd = Json::Dump(jcmd); + auto n_bytes = tracker->Send(scmd); + if (n_bytes != scmd.size()) { + return Fail("Failed to send init command from worker."); + } + return Success(); + } + [[nodiscard]] Result WorkerRecv(TCPSocket* tracker, std::int32_t* p_world) const { + std::string scmd; + auto n_bytes = tracker->Recv(&scmd); + if (n_bytes <= 0) { + return Fail("Failed to recv init command from tracker."); + } + auto jcmd = Json::Load(scmd); + auto world = get(jcmd["world_size"]); + if (world <= 0) { + return Fail("Invalid world size."); + } + *p_world = world; + return Success(); + } + [[nodiscard]] Result 
TrackerHandle(Json jcmd, std::int32_t* recv_world, std::int32_t world, + std::int32_t* p_port, TCPSocket* p_sock, + std::int32_t* eport) const { + *p_port = get(jcmd["port"]); + if (*p_port <= 0) { + return Fail("Invalid port."); + } + if (*recv_world != -1) { + return Fail("Invalid initialization sequence."); + } + *recv_world = world; + *eport = get(jcmd["error_port"]); + return TrackerSend(world, p_sock); + } +}; + +struct Print { + [[nodiscard]] Result WorkerSend(TCPSocket* tracker, std::string msg) const { + Json jcmd{Object{}}; + jcmd["cmd"] = Integer{static_cast(CMD::kPrint)}; + jcmd["msg"] = String{std::move(msg)}; + auto scmd = Json::Dump(jcmd); + auto n_bytes = tracker->Send(scmd); + if (n_bytes != scmd.size()) { + return Fail("Failed to send print command from worker."); + } + return Success(); + } + [[nodiscard]] Result TrackerHandle(Json jcmd, std::string* p_msg) const { + if (!IsA(jcmd["msg"])) { + return Fail("Invalid print command."); + } + auto msg = get(jcmd["msg"]); + *p_msg = msg; + return Success(); + } +}; + +struct ErrorCMD { + [[nodiscard]] Result WorkerSend(TCPSocket* tracker, Result const& res) const { + auto msg = res.Report(); + auto code = res.Code().value(); + Json jcmd{Object{}}; + jcmd["msg"] = String{std::move(msg)}; + jcmd["code"] = Integer{code}; + jcmd["cmd"] = Integer{static_cast(CMD::kError)}; + auto scmd = Json::Dump(jcmd); + auto n_bytes = tracker->Send(scmd); + if (n_bytes != scmd.size()) { + return Fail("Failed to send error command from worker."); + } + return Success(); + } + [[nodiscard]] Result TrackerHandle(Json jcmd, std::string* p_msg, int* p_code) const { + if (!IsA(jcmd["msg"]) || !IsA(jcmd["code"])) { + return Fail("Invalid error command."); + } + auto msg = get(jcmd["msg"]); + auto code = get(jcmd["code"]); + *p_msg = msg; + *p_code = code; + return Success(); + } +}; + +struct ShutdownCMD { + [[nodiscard]] Result Send(TCPSocket* peer) const { + Json jcmd{Object{}}; + jcmd["cmd"] = Integer{static_cast(proto::CMD::kShutdown)}; + auto scmd = Json::Dump(jcmd); + auto n_bytes = peer->Send(scmd); + if (n_bytes != scmd.size()) { + return Fail("Failed to send shutdown command from worker."); + } + return Success(); + } +}; +} // namespace xgboost::collective::proto diff --git a/src/collective/rabit_communicator.h b/src/collective/rabit_communicator.h index 9b79624a2..452e9ad9c 100644 --- a/src/collective/rabit_communicator.h +++ b/src/collective/rabit_communicator.h @@ -7,6 +7,7 @@ #include #include +#include "communicator-inl.h" #include "communicator.h" #include "xgboost/json.h" @@ -55,10 +56,29 @@ class RabitCommunicator : public Communicator { bool IsFederated() const override { return false; } - void AllGather(void *send_receive_buffer, std::size_t size) override { - auto const per_rank = size / GetWorldSize(); + std::string AllGather(std::string_view input) override { + auto const per_rank = input.size(); + auto const total_size = per_rank * GetWorldSize(); auto const index = per_rank * GetRank(); - rabit::Allgather(static_cast(send_receive_buffer), size, index, per_rank, per_rank); + std::string result(total_size, '\0'); + result.replace(index, per_rank, input); + rabit::Allgather(result.data(), total_size, index, per_rank, per_rank); + return result; + } + + std::string AllGatherV(std::string_view input) override { + auto const size_node_slice = input.size(); + auto const all_sizes = collective::Allgather(size_node_slice); + auto const total_size = std::accumulate(all_sizes.cbegin(), all_sizes.cend(), 0ul); + auto const begin_index = + 
std::accumulate(all_sizes.cbegin(), all_sizes.cbegin() + GetRank(), 0ul); + auto const size_prev_slice = + GetRank() == 0 ? all_sizes[GetWorldSize() - 1] : all_sizes[GetRank() - 1]; + + std::string result(total_size, '\0'); + result.replace(begin_index, size_node_slice, input); + rabit::Allgather(result.data(), total_size, begin_index, size_node_slice, size_prev_slice); + return result; } void AllReduce(void *send_receive_buffer, std::size_t count, DataType data_type, diff --git a/src/collective/socket.cc b/src/collective/socket.cc index 78dc3d79b..43da366bd 100644 --- a/src/collective/socket.cc +++ b/src/collective/socket.cc @@ -3,6 +3,7 @@ */ #include "xgboost/collective/socket.h" +#include // for array #include // std::size_t #include // std::int32_t #include // std::memcpy, std::memset @@ -92,13 +93,18 @@ std::size_t TCPSocket::Recv(std::string *p_str) { conn = TCPSocket::Create(addr.Domain()); CHECK_EQ(static_cast(conn.Domain()), static_cast(addr.Domain())); - conn.SetNonBlock(true); + auto non_blocking = conn.NonBlocking(); + auto rc = conn.NonBlocking(true); + if (!rc.OK()) { + return Fail("Failed to set socket option.", std::move(rc)); + } Result last_error; - auto log_failure = [&host, &last_error](Result err, char const *file, std::int32_t line) { + auto log_failure = [&host, &last_error, port](Result err, char const *file, std::int32_t line) { last_error = std::move(err); LOG(WARNING) << std::filesystem::path{file}.filename().string() << "(" << line - << "): Failed to connect to:" << host << " Error:" << last_error.Report(); + << "): Failed to connect to:" << host << ":" << port + << " Error:" << last_error.Report(); }; for (std::int32_t attempt = 0; attempt < std::max(retry, 1); ++attempt) { @@ -112,39 +118,42 @@ std::size_t TCPSocket::Recv(std::string *p_str) { } auto rc = connect(conn.Handle(), addr_handle, addr_len); - if (rc != 0) { - auto errcode = system::LastError(); - if (!system::ErrorWouldBlock(errcode)) { - log_failure(Fail("connect failed.", std::error_code{errcode, std::system_category()}), - __FILE__, __LINE__); - continue; - } - - rabit::utils::PollHelper poll; - poll.WatchWrite(conn); - auto result = poll.Poll(timeout); - if (!result.OK()) { - log_failure(std::move(result), __FILE__, __LINE__); - continue; - } - if (!poll.CheckWrite(conn)) { - log_failure(Fail("poll failed.", std::error_code{errcode, std::system_category()}), - __FILE__, __LINE__); - continue; - } - result = conn.GetSockError(); - if (!result.OK()) { - log_failure(std::move(result), __FILE__, __LINE__); - continue; - } - - conn.SetNonBlock(false); - return Success(); - - } else { - conn.SetNonBlock(false); - return Success(); + if (rc == 0) { + return conn.NonBlocking(non_blocking); } + + auto errcode = system::LastError(); + if (!system::ErrorWouldBlock(errcode)) { + log_failure(Fail("connect failed.", std::error_code{errcode, std::system_category()}), + __FILE__, __LINE__); + continue; + } + + rabit::utils::PollHelper poll; + poll.WatchWrite(conn); + auto result = poll.Poll(timeout); + if (!result.OK()) { + // poll would fail if there's a socket error, we log the root cause instead of the + // poll failure. 
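+ // (GetSockError presumably wraps getsockopt(SO_ERROR), which reports and clears + // the pending error, so the message is more specific than the bare poll result.)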
+ auto sockerr = conn.GetSockError(); + if (!sockerr.OK()) { + result = std::move(sockerr); + } + log_failure(std::move(result), __FILE__, __LINE__); + continue; + } + if (!poll.CheckWrite(conn)) { + log_failure(Fail("poll failed.", std::error_code{errcode, std::system_category()}), __FILE__, + __LINE__); + continue; + } + result = conn.GetSockError(); + if (!result.OK()) { + log_failure(std::move(result), __FILE__, __LINE__); + continue; + } + + return conn.NonBlocking(non_blocking); } std::stringstream ss; @@ -152,4 +161,13 @@ std::size_t TCPSocket::Recv(std::string *p_str) { conn.Close(); return Fail(ss.str(), std::move(last_error)); } + +[[nodiscard]] Result GetHostName(std::string *p_out) { + std::array<char, HOST_NAME_MAX> buf; + if (gethostname(&buf[0], HOST_NAME_MAX) != 0) { + return system::FailWithCode("Failed to get host name."); + } + *p_out = buf.data(); + return Success(); +} } // namespace xgboost::collective diff --git a/src/collective/tracker.cc b/src/collective/tracker.cc new file mode 100644 index 000000000..043e93359 --- /dev/null +++ b/src/collective/tracker.cc @@ -0,0 +1,296 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#if defined(__unix__) || defined(__APPLE__) +#include <netdb.h> // gethostbyname +#include <sys/socket.h> // socket, AF_INET6, AF_INET, connect, getsockname +#endif // defined(__unix__) || defined(__APPLE__) + +#if !defined(NOMINMAX) && defined(_WIN32) +#define NOMINMAX +#endif // !defined(NOMINMAX) + +#if defined(_WIN32) +#include <winsock2.h> +#include <ws2tcpip.h> +#endif // defined(_WIN32) + +#include <algorithm> // for sort +#include <chrono> // for seconds +#include <cstdint> // for int32_t +#include <string> // for string +#include <utility> // for move, forward + +#include "../common/json_utils.h" +#include "comm.h" +#include "protocol.h" // for kMagic, PeerInfo +#include "tracker.h" +#include "xgboost/collective/result.h" // for Result, Fail, Success +#include "xgboost/collective/socket.h" // for GetHostName, FailWithCode, MakeSockAddress, ...
+#include "xgboost/json.h" + +namespace xgboost::collective { +Tracker::Tracker(Json const& config) + : n_workers_{static_cast( + RequiredArg(config, "n_workers", __func__))}, + port_{static_cast(OptionalArg(config, "port", Integer::Int{0}))}, + timeout_{std::chrono::seconds{OptionalArg( + config, "timeout", static_cast(collective::DefaultTimeoutSec()))}} {} + +RabitTracker::WorkerProxy::WorkerProxy(std::int32_t world, TCPSocket sock, SockAddrV4 addr) + : sock_{std::move(sock)} { + auto host = addr.Addr(); + + std::int32_t rank{0}; + rc_ = Success() + << [&] { return proto::Magic{}.Verify(&sock_); } + << [&] { return proto::Connect{}.TrackerRecv(&sock_, &world_, &rank, &task_id_); }; + if (!rc_.OK()) { + return; + } + + std::string cmd; + sock_.Recv(&cmd); + auto jcmd = Json::Load(StringView{cmd}); + cmd_ = static_cast(get(jcmd["cmd"])); + std::int32_t port{0}; + if (cmd_ == proto::CMD::kStart) { + proto::Start start; + rc_ = start.TrackerHandle(jcmd, &world_, world, &port, &sock_, &eport_); + } else if (cmd_ == proto::CMD::kPrint) { + proto::Print print; + rc_ = print.TrackerHandle(jcmd, &msg_); + } else if (cmd_ == proto::CMD::kError) { + proto::ErrorCMD error; + rc_ = error.TrackerHandle(jcmd, &msg_, &code_); + } + if (!rc_.OK()) { + return; + } + + info_ = proto::PeerInfo{host, port, rank}; +} + +RabitTracker::RabitTracker(Json const& config) : Tracker{config} { + std::string self; + auto rc = collective::GetHostAddress(&self); + auto host = OptionalArg(config, "host", self); + + listener_ = TCPSocket::Create(SockDomain::kV4); + rc = listener_.Bind(host, &this->port_); + CHECK(rc.OK()) << rc.Report(); + listener_.Listen(); +} + +Result RabitTracker::Bootstrap(std::vector* p_workers) { + auto& workers = *p_workers; + + std::sort(workers.begin(), workers.end(), WorkerCmp{}); + + std::vector bootstrap_threads; + for (std::int32_t r = 0; r < n_workers_; ++r) { + auto& worker = workers[r]; + auto next = BootstrapNext(r, n_workers_); + auto const& next_w = workers[next]; + bootstrap_threads.emplace_back([next, &worker, &next_w] { + auto jnext = proto::PeerInfo{next_w.Host(), next_w.Port(), next}.ToJson(); + std::string str; + Json::Dump(jnext, &str); + worker.Send(StringView{str}); + }); + } + + for (auto& t : bootstrap_threads) { + t.join(); + } + + for (auto const& w : workers) { + worker_error_handles_.emplace_back(w.Host(), w.ErrorPort()); + } + return Success(); +} + +[[nodiscard]] std::future RabitTracker::Run() { + // a state machine to keep track of consistency. + struct State { + std::int32_t const n_workers; + + std::int32_t n_shutdown{0}; + bool during_restart{false}; + std::vector pending; + + explicit State(std::int32_t world) : n_workers{world} {} + State(State const& that) = delete; + State& operator=(State&& that) = delete; + + void Start(WorkerProxy&& worker) { + CHECK_LT(pending.size(), n_workers); + CHECK_LE(n_shutdown, n_workers); + + pending.emplace_back(std::forward(worker)); + + CHECK_LE(pending.size(), n_workers); + } + void Shutdown() { + CHECK_GE(n_shutdown, 0); + CHECK_LT(n_shutdown, n_workers); + + ++n_shutdown; + + CHECK_LE(n_shutdown, n_workers); + } + void Error() { + CHECK_LE(pending.size(), n_workers); + CHECK_LE(n_shutdown, n_workers); + + during_restart = true; + } + [[nodiscard]] bool Ready() const { + CHECK_LE(pending.size(), n_workers); + return static_cast(pending.size()) == n_workers; + } + void Bootstrap() { + CHECK_EQ(pending.size(), n_workers); + CHECK_LE(n_shutdown, n_workers); + + // A reset. 
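+ // Resetting these counters lets the tracker accept a fresh round of kStart + // connections after an error-triggered restart.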
+ n_shutdown = 0; + during_restart = false; + pending.clear(); + } + [[nodiscard]] bool ShouldContinue() const { + CHECK_LE(pending.size(), n_workers); + CHECK_LE(n_shutdown, n_workers); + // - Without error, we should shut down after all workers are offline. + // - With error, all workers are offline, and we have during_restart as true. + return n_shutdown != n_workers || during_restart; + } + }; + + return std::async(std::launch::async, [this] { + State state{this->n_workers_}; + + while (state.ShouldContinue()) { + TCPSocket sock; + SockAddrV4 addr; + auto rc = listener_.Accept(&sock, &addr); + if (!rc.OK()) { + return Fail("Failed to accept connection.", std::move(rc)); + } + + auto worker = WorkerProxy{n_workers_, std::move(sock), std::move(addr)}; + if (!worker.Status().OK()) { + return Fail("Failed to initialize worker proxy.", std::move(worker.Status())); + } + switch (worker.Command()) { + case proto::CMD::kStart: { + state.Start(std::move(worker)); + if (state.Ready()) { + rc = this->Bootstrap(&state.pending); + state.Bootstrap(); + } + if (!rc.OK()) { + return rc; + } + continue; + } + case proto::CMD::kShutdown: { + state.Shutdown(); + continue; + } + case proto::CMD::kError: { + if (state.during_restart) { + continue; + } + state.Error(); + auto msg = worker.Msg(); + auto code = worker.Code(); + LOG(WARNING) << "Received error from [" << worker.Host() << ":" << worker.Rank() + << "]: " << msg << " code:" << code; + auto host = worker.Host(); + // We signal the error to all workers that haven't aborted already. + for (auto& w : worker_error_handles_) { + if (w.first == host) { + continue; + } + TCPSocket out; + // retry is set to 1; just let the worker time out or error. Otherwise the + // tracker and the worker might be waiting for each other. + auto rc = Connect(w.first, w.second, 1, timeout_, &out); + // Send the signal to stop the worker. + proto::ShutdownCMD shutdown; + rc = shutdown.Send(&out); + if (!rc.OK()) { + return Fail("Failed to inform workers to stop."); + } + } + + continue; + } + case proto::CMD::kPrint: { + LOG(CONSOLE) << worker.Msg(); + continue; + } + case proto::CMD::kInvalid: + default: { + return Fail("Invalid command received."); + } + } + } + return Success(); + }); +} + +[[nodiscard]] Result GetHostAddress(std::string* out) { + auto rc = GetHostName(out); + if (!rc.OK()) { + return rc; + } + auto host = gethostbyname(out->c_str()); + + // Get the IP address from the host entry. + std::string ip; + rc = INetNToP(host, &ip); + if (!rc.OK()) { + return rc; + } + + if (!(ip.size() >= 4 && ip.substr(0, 4) == "127.")) { + // Return if this is a public IP address. + // Not entirely accurate, as there are other reserved IP ranges. + *out = ip; + return Success(); + } + + // Create a UDP socket to probe the public IP address; it's fine even if it's + // unreachable.
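+ // connect() on a SOCK_DGRAM socket sends no packets; the kernel merely selects a + // route and a local source address, which getsockname() can then report.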
+ auto sock = socket(AF_INET, SOCK_DGRAM, 0); + if (sock == -1) { + return Fail("Failed to create socket."); + } + + auto paddr = MakeSockAddress(StringView{"10.255.255.255"}, 1); + sockaddr const* addr_handle = reinterpret_cast<sockaddr const*>(&paddr.V4().Handle()); + socklen_t addr_len{sizeof(paddr.V4().Handle())}; + auto err = connect(sock, addr_handle, addr_len); + if (err != 0) { + return system::FailWithCode("Failed to find IP address."); + } + + // Get the IP address from the socket descriptor. + struct sockaddr_in addr; + socklen_t len = sizeof(addr); + if (getsockname(sock, reinterpret_cast<sockaddr*>(&addr), &len) == -1) { + return Fail("Failed to get sock name."); + } + ip = inet_ntoa(addr.sin_addr); + + err = system::CloseSocket(sock); + if (err != 0) { + return system::FailWithCode("Failed to close socket."); + } + + *out = ip; + return Success(); +} } // namespace xgboost::collective diff --git a/src/collective/tracker.h b/src/collective/tracker.h new file mode 100644 index 000000000..7bbee3c8d --- /dev/null +++ b/src/collective/tracker.h @@ -0,0 +1,141 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#pragma once +#include <chrono> // for seconds +#include <cstdint> // for int32_t +#include <future> // for future +#include <string> // for string +#include <utility> // for pair +#include <vector> // for vector + +#include "protocol.h" +#include "xgboost/collective/result.h" // for Result +#include "xgboost/collective/socket.h" // for TCPSocket +#include "xgboost/json.h" // for Json + +namespace xgboost::collective { +/** + * + * @brief Implementation of the RABIT tracker. + * + * * What is a tracker + * + * The implementation of collective follows what RABIT did in the past. It requires a + * tracker to coordinate the initialization and error recovery of workers. While the + * original implementation attempted to attain error resilience inside the collective + * module, that turned out to be too challenging due to the large amount of external + * state. The new implementation here differs from RABIT in that neither state + * recovery nor resilience is handled inside the collective; it merely provides the + * mechanism to signal errors to other workers through the use of a centralized tracker. + * + * There are three major functionalities provided by a tracker, namely: + * - Initialization. Share the node addresses among all workers. + * - Logging. + * - Signal error. If an exception is thrown in one (or many) of the workers, it can + * signal an error to the tracker and the tracker will notify other workers. + */ +class Tracker { + protected: + std::int32_t n_workers_{0}; + std::int32_t port_{-1}; + std::chrono::seconds timeout_{0}; + + public: + explicit Tracker(Json const& config); + Tracker(std::int32_t n_workers, std::int32_t port, std::chrono::seconds timeout) + : n_workers_{n_workers}, port_{port}, timeout_{timeout} {} + + virtual ~Tracker() noexcept(false) {}  // NOLINT + [[nodiscard]] virtual std::future<Result> Run() = 0; + [[nodiscard]] virtual Json WorkerArgs() const = 0; + [[nodiscard]] std::chrono::seconds Timeout() const { return timeout_; } +}; + +class RabitTracker : public Tracker { + // A wrapper for a connected worker socket.
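+ // It runs the handshake (magic verification, connect info, first command) in its + // constructor and caches the decoded command for the tracker loop.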
+ class WorkerProxy { + TCPSocket sock_; + proto::PeerInfo info_; + std::int32_t eport_{0}; + std::int32_t world_{-1}; + std::string task_id_; + + proto::CMD cmd_{proto::CMD::kInvalid}; + std::string msg_; + std::int32_t code_{0}; + Result rc_; + + public: + explicit WorkerProxy(std::int32_t world, TCPSocket sock, SockAddrV4 addr); + WorkerProxy(WorkerProxy const& that) = delete; + WorkerProxy(WorkerProxy&& that) = default; + WorkerProxy& operator=(WorkerProxy const&) = delete; + WorkerProxy& operator=(WorkerProxy&&) = default; + + [[nodiscard]] auto Host() const { return info_.host; } + [[nodiscard]] auto TaskID() const { return task_id_; } + [[nodiscard]] auto Port() const { return info_.port; } + [[nodiscard]] auto Rank() const { return info_.rank; } + [[nodiscard]] auto ErrorPort() const { return eport_; } + [[nodiscard]] auto Command() const { return cmd_; } + [[nodiscard]] auto Msg() const { return msg_; } + [[nodiscard]] auto Code() const { return code_; } + + [[nodiscard]] Result const& Status() const { return rc_; } + [[nodiscard]] Result& Status() { return rc_; } + + void Send(StringView value) { this->sock_.Send(value); } + }; + // Provide an ordering for workers; this helps us get a deterministic topology. + struct WorkerCmp { + [[nodiscard]] bool operator()(WorkerProxy const& lhs, WorkerProxy const& rhs) { + auto const& lh = lhs.Host(); + auto const& rh = rhs.Host(); + + if (lh != rh) { + return lh < rh; + } + return lhs.TaskID() < rhs.TaskID(); + } + }; + + private: + std::string host_; + // Records how to reach workers if an error happens. + std::vector<std::pair<std::string, std::int32_t>> worker_error_handles_; + // Listening socket for incoming workers. + TCPSocket listener_; + + Result Bootstrap(std::vector<WorkerProxy>* p_workers); + + public: + explicit RabitTracker(StringView host, std::int32_t n_workers, std::int32_t port, + std::chrono::seconds timeout) + : Tracker{n_workers, port, timeout}, host_{host.c_str(), host.size()} { + listener_ = TCPSocket::Create(SockDomain::kV4); + auto rc = listener_.Bind(host, &this->port_); + CHECK(rc.OK()) << rc.Report(); + listener_.Listen(); + } + + explicit RabitTracker(Json const& config); + ~RabitTracker() noexcept(false) override = default; + + std::future<Result> Run() override; + + [[nodiscard]] std::int32_t Port() const { return port_; } + [[nodiscard]] Json WorkerArgs() const override { + Json args{Object{}}; + args["DMLC_TRACKER_URI"] = String{host_}; + args["DMLC_TRACKER_PORT"] = this->Port(); + return args; + } +}; + +// Probe the public IP address of the host; we need a better method. +// +// This is directly translated from the previous Python implementation. We should find a +// more rigorous approach; it could use some expertise in network programming.
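+// On success, *out holds a non-loopback IPv4 address of this host.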
+[[nodiscard]] Result GetHostAddress(std::string* out); } // namespace xgboost::collective diff --git a/src/common/bitfield.h b/src/common/bitfield.h index 050fda906..511769e63 100644 --- a/src/common/bitfield.h +++ b/src/common/bitfield.h @@ -5,17 +5,16 @@ #ifndef XGBOOST_COMMON_BITFIELD_H_ #define XGBOOST_COMMON_BITFIELD_H_ -#include -#include -#include -#include -#include -#include -#include +#include <algorithm> // for min +#include <bitset> // for bitset +#include <cstdint> // for uint32_t, uint64_t, uint8_t +#include <ostream> // for ostream +#include <type_traits> // for conditional_t, is_signed_v #if defined(__CUDACC__) #include #include + #include "device_helpers.cuh" #elif defined(__HIP_PLATFORM_AMD__) #include @@ -23,8 +22,8 @@ #include "device_helpers.hip.h" #endif // defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__) -#include "xgboost/span.h" #include "common.h" +#include "xgboost/span.h" // for Span namespace xgboost { @@ -79,7 +78,7 @@ struct BitFieldContainer { private: value_type* bits_{nullptr}; size_type n_values_{0}; - static_assert(!std::is_signed<value_type>::value, "Must use an unsiged type as the underlying storage."); + static_assert(!std::is_signed_v<value_type>, "Must use an unsigned type as the underlying storage."); public: XGBOOST_DEVICE static Pos ToBitPos(index_type pos) { @@ -244,11 +243,39 @@ struct RBitsPolicy : public BitFieldContainer> { // Format: BitField, underlying type // must be unsigned. -using LBitField64 = BitFieldContainer<uint64_t, LBitsPolicy<uint64_t>>; -using RBitField8 = BitFieldContainer<uint8_t, RBitsPolicy<uint8_t>>; +using LBitField64 = BitFieldContainer<std::uint64_t, LBitsPolicy<std::uint64_t>>; +using RBitField8 = BitFieldContainer<std::uint8_t, RBitsPolicy<std::uint8_t>>; -using LBitField32 = BitFieldContainer<uint32_t, LBitsPolicy<uint32_t>>; -using CLBitField32 = BitFieldContainer<uint32_t, LBitsPolicy<uint32_t>, true>; +using LBitField32 = BitFieldContainer<std::uint32_t, LBitsPolicy<std::uint32_t>>; +using CLBitField32 = BitFieldContainer<std::uint32_t, LBitsPolicy<std::uint32_t>, true>; +using RBitField32 = BitFieldContainer<std::uint32_t, RBitsPolicy<std::uint32_t>>; + +namespace detail { +inline std::uint32_t TrailingZeroBitsImpl(std::uint32_t value) { + auto n = sizeof(value) * 8; + std::uint32_t cnt{0}; + for (decltype(n) i = 0; i < n; i++) { + if ((value >> i) & 1) { + break; + } + cnt++; + } + return cnt; +} +} // namespace detail + +inline std::uint32_t TrailingZeroBits(std::uint32_t value) { + if (value == 0) { + return sizeof(value) * 8; + } +#if defined(__GNUC__) + return __builtin_ctz(value); +#elif defined(_MSC_VER) + return _tzcnt_u32(value); +#else + return detail::TrailingZeroBitsImpl(value); +#endif // __GNUC__ +} } // namespace xgboost #endif // XGBOOST_COMMON_BITFIELD_H_ diff --git a/src/common/common.h b/src/common/common.h index ef11437c8..31fffb955 100644 --- a/src/common/common.h +++ b/src/common/common.h @@ -6,7 +6,6 @@ #ifndef XGBOOST_COMMON_COMMON_H_ #define XGBOOST_COMMON_COMMON_H_ -#include <algorithm> // for max #include <array> // for array #include <cmath> // for ceil #include <cstddef> // for size_t @@ -203,7 +202,7 @@ inline void SetDevice(std::int32_t device) { #endif /** - * Last index of a group in a CSR style of index pointer. + * @brief Last index of a group in a CSR style of index pointer.
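+ * For example, with indptr = {0, 3, 5}, the last index of group 0 is indptr[1] - 1 = 2.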
*/ template <typename Indexable> XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) { diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu index f727384de..7bdd90eb9 100644 --- a/src/common/hist_util.cu +++ b/src/common/hist_util.cu @@ -135,7 +135,7 @@ void SortByWeight(dh::device_vector* weights, dh::device_vector* s #endif } -void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span d_cuts_ptr, +void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span d_cuts_ptr, dh::device_vector* p_sorted_entries, dh::device_vector* p_sorted_weights, dh::caching_device_vector* p_column_sizes_scan) { @@ -252,13 +252,13 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c sorted_entries.data().get(), [] __device__(Entry const& e) -> data::COOTuple { return {0, e.index, e.fvalue}; // row_idx is not needed for scanning column size. }); - detail::GetColumnSizesScan(ctx->Ordinal(), info.num_col_, num_cuts_per_feature, + detail::GetColumnSizesScan(ctx->Device(), info.num_col_, num_cuts_per_feature, IterSpan{batch_it, sorted_entries.size()}, dummy_is_valid, &cuts_ptr, &column_sizes_scan); auto d_cuts_ptr = cuts_ptr.DeviceSpan(); if (sketch_container->HasCategorical()) { auto p_weight = entry_weight.empty() ? nullptr : &entry_weight; - detail::RemoveDuplicatedCategories(ctx->Ordinal(), info, d_cuts_ptr, &sorted_entries, p_weight, + detail::RemoveDuplicatedCategories(ctx->Device(), info, d_cuts_ptr, &sorted_entries, p_weight, &column_sizes_scan); } @@ -359,7 +359,7 @@ HistogramCuts DeviceSketchWithHessian(Context const* ctx, DMatrix* p_fmat, bst_b HistogramCuts cuts; SketchContainer sketch_container(info.feature_types, max_bin, info.num_col_, info.num_row_, - ctx->Ordinal()); + ctx->Device()); CHECK_EQ(has_weight || !hessian.empty(), !d_weight.empty()); for (const auto& page : p_fmat->GetBatches<SparsePage>()) { std::size_t page_nnz = page.data.Size(); diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh index f86685eda..feddba99e 100644 --- a/src/common/hist_util.cuh +++ b/src/common/hist_util.cuh @@ -86,9 +86,9 @@ __global__ void GetColumnSizeSharedMemKernel(IterSpan batch_iter, } template -std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t shared_mem) { +std::uint32_t EstimateGridSize(DeviceOrd device, Kernel kernel, std::size_t shared_mem) { int n_mps = 0; - dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device)); + dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device.ordinal)); int n_blocks_per_mp = 0; dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel, kBlockThreads, shared_mem)); @@ -110,11 +110,11 @@ std::uint32_t EstimateGridSize(std::int32_t device, Kernel kernel, std::size_t s * \param out_column_size Output buffer for the size of each column. */ template -void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan batch_iter, +void LaunchGetColumnSizeKernel(DeviceOrd device, IterSpan batch_iter, data::IsValidFunctor is_valid, Span out_column_size) { thrust::fill_n(thrust::device, dh::tbegin(out_column_size), out_column_size.size(), 0); - std::size_t max_shared_memory = dh::MaxSharedMemory(device); + std::size_t max_shared_memory = dh::MaxSharedMemory(device.ordinal); // Not strictly correct, as we should use the number of samples to determine the type of // counter. However, the sample size is not known due to the sliding window on the number of // elements.
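A recurring change in the hunks above and below is the migration from raw `int` device ordinals to the strongly typed `DeviceOrd`. As a rough sketch of the pattern, inferred only from the call sites visible in this diff (`ordinal`, `IsCUDA()`, `IsCPU()`, `DeviceOrd::CPU()`); the actual definition in xgboost's context header may differ:

#include <cstdint>

// Minimal illustration only; not the actual XGBoost header.
struct DeviceOrd {
  enum class Kind : std::int8_t { kCPU = 0, kCUDA = 1 };
  Kind device{Kind::kCPU};
  std::int32_t ordinal{-1};  // CUDA device index; unused for CPU.

  [[nodiscard]] bool IsCUDA() const { return device == Kind::kCUDA; }
  [[nodiscard]] bool IsCPU() const { return device == Kind::kCPU; }

  static DeviceOrd CPU() { return {}; }
  static DeviceOrd CUDA(std::int32_t ordinal) { return {Kind::kCUDA, ordinal}; }
};

// The raw ordinal then only appears at the CUDA API boundary, e.g.:
//   dh::safe_cuda(cudaSetDevice(device.ordinal));

This keeps CPU/GPU dispatch type-safe in the C++ core, while the integer ordinal is confined to CUDA runtime calls.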
@@ -158,7 +158,7 @@ void LaunchGetColumnSizeKernel(std::int32_t device, IterSpan batch_iter } template -void GetColumnSizesScan(int device, size_t num_columns, std::size_t num_cuts_per_feature, +void GetColumnSizesScan(DeviceOrd device, size_t num_columns, std::size_t num_cuts_per_feature, IterSpan batch_iter, data::IsValidFunctor is_valid, HostDeviceVector* cuts_ptr, dh::caching_device_vector* column_sizes_scan) { @@ -228,7 +228,8 @@ size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz, // Count the valid entries in each column and copy them out. template void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Range1d range, - float missing, size_t columns, size_t cuts_per_feature, int device, + float missing, size_t columns, size_t cuts_per_feature, + DeviceOrd device, HostDeviceVector* cut_sizes_scan, dh::caching_device_vector* column_sizes_scan, dh::device_vector* sorted_entries) { @@ -252,7 +253,7 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran void SortByWeight(dh::device_vector* weights, dh::device_vector* sorted_entries); -void RemoveDuplicatedCategories(int32_t device, MetaInfo const& info, Span d_cuts_ptr, +void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span d_cuts_ptr, dh::device_vector* p_sorted_entries, dh::device_vector* p_sorted_weights, dh::caching_device_vector* p_column_sizes_scan); @@ -290,7 +291,7 @@ inline HistogramCuts DeviceSketch(Context const* ctx, DMatrix* p_fmat, bst_bin_t template void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info, - int device, size_t columns, size_t begin, size_t end, + DeviceOrd device, size_t columns, size_t begin, size_t end, float missing, SketchContainer *sketch_container, int num_cuts) { // Copy current subset of valid elements into temporary storage and sort @@ -335,11 +336,11 @@ void ProcessSlidingWindow(AdapterBatch const &batch, MetaInfo const &info, template void ProcessWeightedSlidingWindow(Batch batch, MetaInfo const& info, int num_cuts_per_feature, - bool is_ranking, float missing, int device, + bool is_ranking, float missing, DeviceOrd device, size_t columns, size_t begin, size_t end, SketchContainer *sketch_container) { dh::XGBCachingDeviceAllocator alloc; - dh::safe_cuda(cudaSetDevice(device)); + dh::safe_cuda(cudaSetDevice(device.ordinal)); info.weights_.SetDevice(device); auto weights = info.weights_.ConstDeviceSpan(); @@ -451,14 +452,14 @@ void AdapterDeviceSketch(Batch batch, int num_bins, size_t num_rows = batch.NumRows(); size_t num_cols = batch.NumCols(); size_t num_cuts_per_feature = detail::RequiredSampleCutsPerColumn(num_bins, num_rows); - int32_t device = sketch_container->DeviceIdx(); + auto device = sketch_container->DeviceIdx(); bool weighted = !info.weights_.Empty(); if (weighted) { sketch_batch_num_elements = detail::SketchBatchNumElements( sketch_batch_num_elements, num_rows, num_cols, std::numeric_limits::max(), - device, num_cuts_per_feature, true); + device.ordinal, num_cuts_per_feature, true); for (auto begin = 0ull; begin < batch.Size(); begin += sketch_batch_num_elements) { size_t end = std::min(batch.Size(), static_cast(begin + sketch_batch_num_elements)); @@ -471,7 +472,7 @@ void AdapterDeviceSketch(Batch batch, int num_bins, sketch_batch_num_elements = detail::SketchBatchNumElements( sketch_batch_num_elements, num_rows, num_cols, std::numeric_limits::max(), - device, num_cuts_per_feature, false); + device.ordinal, num_cuts_per_feature, false); for (auto begin = 0ull; 
begin < batch.Size(); begin += sketch_batch_num_elements) { size_t end = std::min(batch.Size(), static_cast(begin + sketch_batch_num_elements)); diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc index 8ba6cabf4..f755426af 100644 --- a/src/common/host_device_vector.cc +++ b/src/common/host_device_vector.cc @@ -33,19 +33,19 @@ struct HostDeviceVectorImpl { }; template -HostDeviceVector::HostDeviceVector(size_t size, T v, int) +HostDeviceVector::HostDeviceVector(size_t size, T v, DeviceOrd) : impl_(nullptr) { impl_ = new HostDeviceVectorImpl(size, v); } template -HostDeviceVector::HostDeviceVector(std::initializer_list init, int) +HostDeviceVector::HostDeviceVector(std::initializer_list init, DeviceOrd) : impl_(nullptr) { impl_ = new HostDeviceVectorImpl(init); } template -HostDeviceVector::HostDeviceVector(const std::vector& init, int) +HostDeviceVector::HostDeviceVector(const std::vector& init, DeviceOrd) : impl_(nullptr) { impl_ = new HostDeviceVectorImpl(init); } @@ -81,7 +81,7 @@ template size_t HostDeviceVector::Size() const { return impl_->Vec().size(); } template -int HostDeviceVector::DeviceIdx() const { return -1; } +DeviceOrd HostDeviceVector::Device() const { return DeviceOrd::CPU(); } template T* HostDeviceVector::DevicePointer() { return nullptr; } @@ -165,9 +165,6 @@ bool HostDeviceVector::DeviceCanWrite() const { return false; } -template -void HostDeviceVector::SetDevice(int) const {} - template void HostDeviceVector::SetDevice(DeviceOrd) const {} @@ -178,6 +175,7 @@ template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; // bst_node_t template class HostDeviceVector; +template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; // bst_row_t diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu index a9102f668..d9ae38ced 100644 --- a/src/common/host_device_vector.cu +++ b/src/common/host_device_vector.cu @@ -25,8 +25,8 @@ void SetCudaSetDeviceHandler(void (*handler)(int)) { template class HostDeviceVectorImpl { public: - HostDeviceVectorImpl(size_t size, T v, int device) : device_(device) { - if (device >= 0) { + HostDeviceVectorImpl(size_t size, T v, DeviceOrd device) : device_(device) { + if (device.IsCUDA()) { gpu_access_ = GPUAccess::kWrite; SetDevice(); data_d_->resize(size, v); @@ -37,8 +37,8 @@ class HostDeviceVectorImpl { // Initializer can be std::vector or std::initializer_list template - HostDeviceVectorImpl(const Initializer& init, int device) : device_(device) { - if (device >= 0) { + HostDeviceVectorImpl(const Initializer& init, DeviceOrd device) : device_(device) { + if (device.IsCUDA()) { gpu_access_ = GPUAccess::kWrite; LazyResizeDevice(init.size()); Copy(init); @@ -54,16 +54,16 @@ class HostDeviceVectorImpl { gpu_access_{that.gpu_access_} {} ~HostDeviceVectorImpl() { - if (device_ >= 0) { + if (device_.IsCUDA()) { SetDevice(); } } - size_t Size() const { + [[nodiscard]] size_t Size() const { return HostCanRead() ? data_h_.size() : data_d_ ? 
data_d_->size() : 0; } - int DeviceIdx() const { return device_; } + [[nodiscard]] DeviceOrd Device() const { return device_; } T* DevicePointer() { LazySyncDevice(GPUAccess::kWrite); @@ -138,8 +138,7 @@ class HostDeviceVectorImpl { } else { auto ptr = other->ConstDevicePointer(); SetDevice(); - CHECK_EQ(this->DeviceIdx(), other->DeviceIdx()); - + CHECK_EQ(this->Device(), other->Device()); dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size, ptr, other->Size() * sizeof(T), @@ -157,24 +156,25 @@ class HostDeviceVectorImpl { return data_h_; } - void SetDevice(int device) { + void SetDevice(DeviceOrd device) { if (device_ == device) { return; } - if (device_ >= 0) { + if (device_.IsCUDA()) { LazySyncHost(GPUAccess::kNone); } - if (device_ >= 0 && device >= 0) { - CHECK_EQ(device_, device) << "New device ordinal is different from previous one."; + if (device_.IsCUDA() && device.IsCUDA()) { + CHECK_EQ(device_.ordinal, device.ordinal) + << "New device ordinal is different from previous one."; } device_ = device; - if (device_ >= 0) { + if (device_.IsCUDA()) { LazyResizeDevice(data_h_.size()); } } void Resize(size_t new_size, T v) { if (new_size == Size()) { return; } - if ((Size() == 0 && device_ >= 0) || (DeviceCanWrite() && device_ >= 0)) { + if ((Size() == 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) { // fast on-device resize gpu_access_ = GPUAccess::kWrite; SetDevice(); @@ -221,16 +221,16 @@ class HostDeviceVectorImpl { gpu_access_ = access; } - bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; } - bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); } - bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); } - bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; } - bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); } - bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); } - GPUAccess Access() const { return gpu_access_; } + [[nodiscard]] bool HostCanAccess(GPUAccess access) const { return gpu_access_ <= access; } + [[nodiscard]] bool HostCanRead() const { return HostCanAccess(GPUAccess::kRead); } + [[nodiscard]] bool HostCanWrite() const { return HostCanAccess(GPUAccess::kNone); } + [[nodiscard]] bool DeviceCanAccess(GPUAccess access) const { return gpu_access_ >= access; } + [[nodiscard]] bool DeviceCanRead() const { return DeviceCanAccess(GPUAccess::kRead); } + [[nodiscard]] bool DeviceCanWrite() const { return DeviceCanAccess(GPUAccess::kWrite); } + [[nodiscard]] GPUAccess Access() const { return gpu_access_; } private: - int device_{-1}; + DeviceOrd device_{DeviceOrd::CPU()}; std::vector data_h_{}; std::unique_ptr> data_d_{}; GPUAccess gpu_access_{GPUAccess::kNone}; @@ -264,11 +264,11 @@ class HostDeviceVectorImpl { } void SetDevice() { - CHECK_GE(device_, 0); + CHECK_GE(device_.ordinal, 0); if (cudaSetDeviceHandler == nullptr) { - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); } else { - (*cudaSetDeviceHandler)(device_); + (*cudaSetDeviceHandler)(device_.ordinal); } if (!data_d_) { @@ -278,15 +278,15 @@ class HostDeviceVectorImpl { }; template -HostDeviceVector::HostDeviceVector(size_t size, T v, int device) +HostDeviceVector::HostDeviceVector(size_t size, T v, DeviceOrd device) : impl_(new HostDeviceVectorImpl(size, v, device)) {} template -HostDeviceVector::HostDeviceVector(std::initializer_list init, int device) +HostDeviceVector::HostDeviceVector(std::initializer_list init, DeviceOrd device) 
: impl_(new HostDeviceVectorImpl(init, device)) {} template -HostDeviceVector::HostDeviceVector(const std::vector& init, int device) +HostDeviceVector::HostDeviceVector(const std::vector& init, DeviceOrd device) : impl_(new HostDeviceVectorImpl(init, device)) {} template @@ -314,7 +314,9 @@ template size_t HostDeviceVector::Size() const { return impl_->Size(); } template -int HostDeviceVector::DeviceIdx() const { return impl_->DeviceIdx(); } +DeviceOrd HostDeviceVector::Device() const { + return impl_->Device(); +} template T* HostDeviceVector::DevicePointer() { @@ -394,14 +396,9 @@ GPUAccess HostDeviceVector::DeviceAccess() const { return impl_->Access(); } -template -void HostDeviceVector::SetDevice(int device) const { - impl_->SetDevice(device); -} - template void HostDeviceVector::SetDevice(DeviceOrd device) const { - impl_->SetDevice(device.ordinal); + impl_->SetDevice(device); } template @@ -416,6 +413,7 @@ template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; // bst_node_t template class HostDeviceVector; +template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; // bst_row_t diff --git a/src/common/io.h b/src/common/io.h index 07bb60787..5e9d27582 100644 --- a/src/common/io.h +++ b/src/common/io.h @@ -8,7 +8,7 @@ #define XGBOOST_COMMON_IO_H_ #include -#include +#include // for MemoryFixSizeBuffer, MemoryBufferStream #include // for min, fill_n, copy_n #include // for array @@ -382,7 +382,8 @@ class PrivateMmapConstStream : public AlignedResourceReadStream { * @param length See the `length` parameter of `mmap` for details. */ explicit PrivateMmapConstStream(std::string path, std::size_t offset, std::size_t length) - : AlignedResourceReadStream{std::make_shared(path, offset, length)} {} + : AlignedResourceReadStream{std::shared_ptr{ // NOLINT + new MmapResource{std::move(path), offset, length}}} {} ~PrivateMmapConstStream() noexcept(false) override; }; diff --git a/src/common/json_utils.h b/src/common/json_utils.h new file mode 100644 index 000000000..a2a8a3cae --- /dev/null +++ b/src/common/json_utils.h @@ -0,0 +1,74 @@ +/** + * Copyright 2023, XGBoost Contributors + * + * @brief Utils tailored for XGBoost. + */ +#pragma once + +#include // for string +#include // for enable_if_t, remove_const_t + +#include "xgboost/json.h" +#include "xgboost/string_view.h" // for StringView + +namespace xgboost { +namespace detail { +template +bool TypeCheckImpl(Json const &value) { + return IsA(value); +} + +template +std::enable_if_t TypeCheckImpl(Json const &value) { + return IsA(value) || TypeCheckImpl(value); +} + +template +std::string TypeCheckError() { + return "`" + Head{}.TypeStr() + "`"; +} + +template +std::enable_if_t TypeCheckError() { + return "`" + Head{}.TypeStr() + "`, " + TypeCheckError(); +} +} // namespace detail + +/** + * @brief Type check for JSON-based parameters + * + * @tparam JT Expected JSON types. + * @param value Value to be checked. 
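+ * @param name Name of the parameter, used in the error message.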
+ */ +template <typename... JT> +void TypeCheck(Json const &value, StringView name) { + if (!detail::TypeCheckImpl<JT...>(value)) { + LOG(FATAL) << "Invalid type for: `" << name << "`, expecting one of the: {`" + << detail::TypeCheckError<JT...>() << "}, got: `" << value.GetValue().TypeStr() + << "`"; + } +} + +template <typename T> +auto const &RequiredArg(Json const &in, StringView key, StringView func) { + auto const &obj = get<Object const>(in); + auto it = obj.find(key); + if (it == obj.cend() || IsA<Null>(it->second)) { + LOG(FATAL) << "Argument `" << key << "` is required for `" << func << "`."; + } + TypeCheck<T>(it->second, StringView{key}); + return get<T const>(it->second); +} + +template <typename T> +auto const &OptionalArg(Json const &in, StringView key, T const &dft) { + auto const &obj = get<Object const>(in); + auto it = obj.find(key); + if (it != obj.cend() && !IsA<Null>(it->second)) { + TypeCheck<T>(it->second, key); + + return get<T const>(it->second); + } + return dft; +} +} // namespace xgboost diff --git a/src/common/linalg_op.cuh b/src/common/linalg_op.cuh index 1f68c6ce7..a05c75ba7 100644 --- a/src/common/linalg_op.cuh +++ b/src/common/linalg_op.cuh @@ -44,7 +44,7 @@ void ElementWiseTransformDevice(linalg::TensorView t, Fn&& fn, cudaStream_ template void ElementWiseKernel(Context const* ctx, linalg::TensorView t, Fn&& fn) { - ctx->IsCPU() ? ElementWiseKernelHost(t, ctx->Threads(), fn) : ElementWiseKernelDevice(t, fn); + ctx->IsCUDA() ? ElementWiseKernelDevice(t, fn) : ElementWiseKernelHost(t, ctx->Threads(), fn); } } // namespace linalg } // namespace xgboost diff --git a/src/common/linalg_op.h b/src/common/linalg_op.h index dae2112c0..325208685 100644 --- a/src/common/linalg_op.h +++ b/src/common/linalg_op.h @@ -55,7 +55,7 @@ void ElementWiseTransformDevice(linalg::TensorView, Fn&&, void* = nullptr) template void ElementWiseKernel(Context const* ctx, linalg::TensorView t, Fn&& fn) { - if (!ctx->IsCPU()) { + if (ctx->IsCUDA()) { common::AssertGPUSupport(); } ElementWiseKernelHost(t, ctx->Threads(), fn); diff --git a/src/common/numeric.cc b/src/common/numeric.cc index 240e0234a..f19932311 100644 --- a/src/common/numeric.cc +++ b/src/common/numeric.cc @@ -11,13 +11,14 @@ namespace xgboost { namespace common { double Reduce(Context const* ctx, HostDeviceVector const& values) { - if (ctx->IsCPU()) { + if (ctx->IsCUDA()) { + return cuda_impl::Reduce(ctx, values); + } else { auto const& h_values = values.ConstHostVector(); auto result = cpu_impl::Reduce(ctx, h_values.cbegin(), h_values.cend(), 0.0); static_assert(std::is_same::value); return result; } - return cuda_impl::Reduce(ctx, values); } } // namespace common } // namespace xgboost diff --git a/src/common/numeric.cu b/src/common/numeric.cu index ce8035f7e..8d115506a 100644 --- a/src/common/numeric.cu +++ b/src/common/numeric.cu @@ -8,11 +8,9 @@ #include "xgboost/context.h" // Context #include "xgboost/host_device_vector.h" // HostDeviceVector -namespace xgboost { -namespace common { -namespace cuda_impl { +namespace xgboost::common::cuda_impl { double Reduce(Context const* ctx, HostDeviceVector const& values) { - values.SetDevice(ctx->gpu_id); + values.SetDevice(ctx->Device()); auto const d_values = values.ConstDeviceSpan(); dh::XGBCachingDeviceAllocator alloc; @@ -24,6 +22,4 @@ double Reduce(Context const* ctx, HostDeviceVector const& values) { thrust::plus{}); #endif } -} // namespace cuda_impl -} // namespace common -} // namespace xgboost +} // namespace xgboost::common::cuda_impl diff --git a/src/common/optional_weight.h b/src/common/optional_weight.h index c2844d73f..bbfd365c8 100644 ---
a/src/common/optional_weight.h +++ b/src/common/optional_weight.h @@ -24,9 +24,9 @@ struct OptionalWeights { inline OptionalWeights MakeOptionalWeights(Context const* ctx, HostDeviceVector const& weights) { if (ctx->IsCUDA()) { - weights.SetDevice(ctx->gpu_id); + weights.SetDevice(ctx->Device()); } - return OptionalWeights{ctx->IsCPU() ? weights.ConstHostSpan() : weights.ConstDeviceSpan()}; + return OptionalWeights{ctx->IsCUDA() ? weights.ConstDeviceSpan() : weights.ConstHostSpan()}; } } // namespace xgboost::common #endif // XGBOOST_COMMON_OPTIONAL_WEIGHT_H_ diff --git a/src/common/quantile.cu b/src/common/quantile.cu index 9896165ad..6040e266f 100644 --- a/src/common/quantile.cu +++ b/src/common/quantile.cu @@ -242,11 +242,10 @@ common::Span> MergePath( // summary does the output element come from) result by definition of merged rank. So we // run it in 2 passes to obtain the merge path and then customize the standard merge // algorithm. -void MergeImpl(int32_t device, Span const &d_x, +void MergeImpl(DeviceOrd device, Span const &d_x, Span const &x_ptr, Span const &d_y, Span const &y_ptr, Span out, Span out_ptr) { - dh::safe_cuda(cudaSetDevice(device)); - + dh::safe_cuda(cudaSetDevice(device.ordinal)); CHECK_EQ(d_x.size() + d_y.size(), out.size()); CHECK_EQ(x_ptr.size(), out_ptr.size()); CHECK_EQ(y_ptr.size(), out_ptr.size()); @@ -344,8 +343,7 @@ void MergeImpl(int32_t device, Span const &d_x, void SketchContainer::Push(Span entries, Span columns_ptr, common::Span cuts_ptr, size_t total_cuts, Span weights) { - dh::safe_cuda(cudaSetDevice(device_)); - + dh::safe_cuda(cudaSetDevice(device_.ordinal)); Span out; dh::device_vector cuts; bool first_window = this->Current().empty(); @@ -404,7 +402,7 @@ size_t SketchContainer::ScanInput(Span entries, Span d_col * pruning or merging. We preserve the first type and remove the second type. 
*/ timer_.Start(__func__); - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); CHECK_EQ(d_columns_ptr_in.size(), num_columns_ + 1); dh::XGBCachingDeviceAllocator alloc; @@ -461,7 +459,7 @@ size_t SketchContainer::ScanInput(Span entries, Span d_col void SketchContainer::Prune(size_t to) { timer_.Start(__func__); - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); OffsetT to_total = 0; auto& h_columns_ptr = columns_ptr_b_.HostVector(); @@ -496,8 +494,7 @@ void SketchContainer::Prune(size_t to) { void SketchContainer::Merge(Span d_that_columns_ptr, Span that) { - dh::safe_cuda(cudaSetDevice(device_)); - + dh::safe_cuda(cudaSetDevice(device_.ordinal)); timer_.Start(__func__); if (this->Current().size() == 0) { CHECK_EQ(this->columns_ptr_.HostVector().back(), 0); @@ -532,8 +529,7 @@ void SketchContainer::Merge(Span d_that_columns_ptr, } void SketchContainer::FixError() { - dh::safe_cuda(cudaSetDevice(device_)); - + dh::safe_cuda(cudaSetDevice(device_.ordinal)); auto d_columns_ptr = this->columns_ptr_.ConstDeviceSpan(); auto in = dh::ToSpan(this->Current()); dh::LaunchN(in.size(), [=] __device__(size_t idx) { @@ -558,7 +554,7 @@ void SketchContainer::FixError() { } void SketchContainer::AllReduce(bool is_column_split) { - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); auto world = collective::GetWorldSize(); if (world == 1 || is_column_split) { return; @@ -585,15 +581,15 @@ void SketchContainer::AllReduce(bool is_column_split) { auto offset = rank * d_columns_ptr.size(); thrust::copy(thrust::device, d_columns_ptr.data(), d_columns_ptr.data() + d_columns_ptr.size(), gathered_ptrs.begin() + offset); - collective::AllReduce(device_, gathered_ptrs.data().get(), + collective::AllReduce(device_.ordinal, gathered_ptrs.data().get(), gathered_ptrs.size()); // Get the data from all workers. std::vector recv_lengths; dh::caching_device_vector recvbuf; - collective::AllGatherV(device_, this->Current().data().get(), + collective::AllGatherV(device_.ordinal, this->Current().data().get(), dh::ToSpan(this->Current()).size_bytes(), &recv_lengths, &recvbuf); - collective::Synchronize(device_); + collective::Synchronize(device_.ordinal); // Segment the received data. auto s_recvbuf = dh::ToSpan(recvbuf); @@ -640,7 +636,7 @@ struct InvalidCatOp { void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) { timer_.Start(__func__); - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); p_cuts->min_vals_.Resize(num_columns_); // Sync between workers. @@ -690,21 +686,41 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts, bool is_column_split) { }); CHECK_EQ(num_columns_, d_in_columns_ptr.size() - 1); max_values.resize(d_in_columns_ptr.size() - 1); + + // In some cases (e.g. column-wise data split), we may have empty columns, so we need to keep + // track of the unique keys (feature indices) after the thrust::reduce_by_key` call. + dh::caching_device_vector d_max_keys(d_in_columns_ptr.size() - 1); dh::caching_device_vector d_max_values(d_in_columns_ptr.size() - 1); - #if defined(XGBOOST_USE_CUDA) - thrust::reduce_by_key(thrust::cuda::par(alloc), key_it, key_it + in_cut_values.size(), val_it, - thrust::make_discard_iterator(), d_max_values.begin(), - thrust::equal_to{}, - [] __device__(auto l, auto r) { return l.value > r.value ? 
l : r; }); -#elif defined(XGBOOST_USE_HIP) - thrust::reduce_by_key(thrust::hip::par(alloc), key_it, key_it + in_cut_values.size(), val_it, - thrust::make_discard_iterator(), d_max_values.begin(), - thrust::equal_to{}, - [] __device__(auto l, auto r) { return l.value > r.value ? l : r; }); -#endif + auto new_end = thrust::reduce_by_key( + thrust::cuda::par(alloc), key_it, key_it + in_cut_values.size(), val_it, d_max_keys.begin(), + d_max_values.begin(), thrust::equal_to{}, + [] __device__(auto l, auto r) { return l.value > r.value ? l : r; }); + d_max_keys.erase(new_end.first, d_max_keys.end()); + d_max_values.erase(new_end.second, d_max_values.end()); - dh::CopyDeviceSpanToVector(&max_values, dh::ToSpan(d_max_values)); + // The device vector needs to be initialized explicitly since we may have some missing columns. + SketchEntry default_entry{}; + dh::caching_device_vector d_max_results(d_in_columns_ptr.size() - 1, + default_entry); + thrust::scatter(thrust::cuda::par(alloc), d_max_values.begin(), d_max_values.end(), + d_max_keys.begin(), d_max_results.begin()); +#elif defined(XGBOOST_USE_HIP) + auto new_end = thrust::reduce_by_key( + thrust::hip::par(alloc), key_it, key_it + in_cut_values.size(), val_it, d_max_keys.begin(), + d_max_values.begin(), thrust::equal_to{}, + [] __device__(auto l, auto r) { return l.value > r.value ? l : r; }); + d_max_keys.erase(new_end.first, d_max_keys.end()); + d_max_values.erase(new_end.second, d_max_values.end()); + + // The device vector needs to be initialized explicitly since we may have some missing columns. + SketchEntry default_entry{}; + dh::caching_device_vector d_max_results(d_in_columns_ptr.size() - 1, + default_entry); + thrust::scatter(thrust::hip::par(alloc), d_max_values.begin(), d_max_values.end(), + d_max_keys.begin(), d_max_results.begin()); +#endif + dh::CopyDeviceSpanToVector(&max_values, dh::ToSpan(d_max_results)); auto max_it = MakeIndexTransformIter([&](auto i) { if (IsCat(h_feature_types, i)) { return max_values[i].value; diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh index 221706274..1eaa15c70 100644 --- a/src/common/quantile.cuh +++ b/src/common/quantile.cuh @@ -41,7 +41,7 @@ class SketchContainer { bst_row_t num_rows_; bst_feature_t num_columns_; int32_t num_bins_; - int32_t device_; + DeviceOrd device_; // Double buffer as neither prune nor merge can be performed inplace. dh::device_vector entries_a_; @@ -93,35 +93,32 @@ class SketchContainer { * \param num_rows Total number of rows in known dataset (typically the rows in current worker). * \param device GPU ID. 
*/ - SketchContainer(HostDeviceVector const &feature_types, - int32_t max_bin, bst_feature_t num_columns, - bst_row_t num_rows, int32_t device) - : num_rows_{num_rows}, - num_columns_{num_columns}, num_bins_{max_bin}, device_{device} { - CHECK_GE(device, 0); - // Initialize Sketches for this dmatrix - this->columns_ptr_.SetDevice(device_); - this->columns_ptr_.Resize(num_columns + 1); - this->columns_ptr_b_.SetDevice(device_); - this->columns_ptr_b_.Resize(num_columns + 1); + SketchContainer(HostDeviceVector const& feature_types, int32_t max_bin, + bst_feature_t num_columns, bst_row_t num_rows, DeviceOrd device) + : num_rows_{num_rows}, num_columns_{num_columns}, num_bins_{max_bin}, device_{device} { + CHECK(device.IsCUDA()); + // Initialize Sketches for this dmatrix + this->columns_ptr_.SetDevice(device_); + this->columns_ptr_.Resize(num_columns + 1); + this->columns_ptr_b_.SetDevice(device_); + this->columns_ptr_b_.Resize(num_columns + 1); - this->feature_types_.Resize(feature_types.Size()); - this->feature_types_.Copy(feature_types); - // Pull to device. - this->feature_types_.SetDevice(device); - this->feature_types_.ConstDeviceSpan(); - this->feature_types_.ConstHostSpan(); + this->feature_types_.Resize(feature_types.Size()); + this->feature_types_.Copy(feature_types); + // Pull to device. + this->feature_types_.SetDevice(device); + this->feature_types_.ConstDeviceSpan(); + this->feature_types_.ConstHostSpan(); - auto d_feature_types = feature_types_.ConstDeviceSpan(); - has_categorical_ = - !d_feature_types.empty() && - thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types), - common::IsCatOp{}); + auto d_feature_types = feature_types_.ConstDeviceSpan(); + has_categorical_ = + !d_feature_types.empty() && + thrust::any_of(dh::tbegin(d_feature_types), dh::tend(d_feature_types), common::IsCatOp{}); - timer_.Init(__func__); - } + timer_.Init(__func__); + } /* \brief Return GPU ID for this container. */ - int32_t DeviceIdx() const { return device_; } + [[nodiscard]] DeviceOrd DeviceIdx() const { return device_; } /* \brief Whether the predictor matrix contains categorical features. */ bool HasCategorical() const { return has_categorical_; } /* \brief Accumulate weights of duplicated entries in input. */ @@ -175,9 +172,7 @@ class SketchContainer { template > size_t Unique(KeyComp key_comp = thrust::equal_to{}) { timer_.Start(__func__); - - dh::safe_cuda(cudaSetDevice(device_)); - + dh::safe_cuda(cudaSetDevice(device_.ordinal)); this->columns_ptr_.SetDevice(device_); Span d_column_scan = this->columns_ptr_.DeviceSpan(); CHECK_EQ(d_column_scan.size(), num_columns_ + 1); @@ -195,7 +190,7 @@ class SketchContainer { d_column_scan.data() + d_column_scan.size(), entries.data(), entries.data() + entries.size(), scan_out.DevicePointer(), entries.data(), detail::SketchUnique{}, key_comp); -#else +#elif defined(XGBOOST_USE_CUDA) size_t n_uniques = dh::SegmentedUnique( thrust::cuda::par(alloc), d_column_scan.data(), d_column_scan.data() + d_column_scan.size(), entries.data(), diff --git a/src/common/quantile.h b/src/common/quantile.h index 48758b8dc..47db5f875 100644 --- a/src/common/quantile.h +++ b/src/common/quantile.h @@ -35,13 +35,13 @@ struct WQSummary { /*! \brief an entry in the sketch summary */ struct Entry { /*! \brief minimum rank */ - RType rmin; + RType rmin{}; /*! \brief maximum rank */ - RType rmax; + RType rmax{}; /*! \brief maximum weight */ - RType wmin; + RType wmin{}; /*! 
\brief the value of data */ - DType value; + DType value{}; // constructor XGBOOST_DEVICE Entry() {} // NOLINT // constructor diff --git a/src/common/quantile_loss_utils.cc b/src/common/quantile_loss_utils.cc index 59397b701..df2fa6edd 100644 --- a/src/common/quantile_loss_utils.cc +++ b/src/common/quantile_loss_utils.cc @@ -1,19 +1,19 @@ /** - * Copyright 2023 by XGBoost contributors + * Copyright 2023, XGBoost contributors */ #include "quantile_loss_utils.h" -#include // std::isspace -#include // std::istream -#include // std::ostream -#include // std::string -#include // std::vector +#include // for isspace +#include // for istream +#include // for ostream +#include // for string +#include // for vector -#include "xgboost/json.h" // F32Array,TypeCheck,get,Number -#include "xgboost/json_io.h" // JsonWriter +#include "../common/json_utils.h" // for TypeCheck +#include "xgboost/json.h" // for F32Array, get, Number +#include "xgboost/json_io.h" // for JsonWriter -namespace xgboost { -namespace common { +namespace xgboost::common { std::ostream& operator<<(std::ostream& os, const ParamFloatArray& array) { auto const& t = array.Get(); xgboost::F32Array arr{t.size()}; @@ -70,5 +70,4 @@ std::istream& operator>>(std::istream& is, ParamFloatArray& array) { } DMLC_REGISTER_PARAMETER(QuantileLossParam); -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/src/common/ranking_utils.h b/src/common/ranking_utils.h index 31531a597..e6b87ed4b 100644 --- a/src/common/ranking_utils.h +++ b/src/common/ranking_utils.h @@ -197,10 +197,10 @@ class RankingCache { CHECK_EQ(info.group_ptr_.back(), info.labels.Size()) << error::GroupSize() << "the size of label."; } - if (ctx->IsCPU()) { - this->InitOnCPU(ctx, info); - } else { + if (ctx->IsCUDA()) { this->InitOnCUDA(ctx, info); + } else { + this->InitOnCPU(ctx, info); } if (!info.weights_.Empty()) { CHECK_EQ(Groups(), info.weights_.Size()) << error::GroupWeight(); @@ -218,7 +218,7 @@ class RankingCache { // Constructed as [1, n_samples] if group ptr is not supplied by the user common::Span DataGroupPtr(Context const* ctx) const { group_ptr_.SetDevice(ctx->Device()); - return ctx->IsCPU() ? group_ptr_.ConstHostSpan() : group_ptr_.ConstDeviceSpan(); + return ctx->IsCUDA() ? group_ptr_.ConstDeviceSpan() : group_ptr_.ConstHostSpan(); } [[nodiscard]] auto const& Param() const { return param_; } @@ -231,10 +231,10 @@ class RankingCache { sorted_idx_cache_.SetDevice(ctx->Device()); sorted_idx_cache_.Resize(predt.size()); } - if (ctx->IsCPU()) { - return this->MakeRankOnCPU(ctx, predt); - } else { + if (ctx->IsCUDA()) { return this->MakeRankOnCUDA(ctx, predt); + } else { + return this->MakeRankOnCPU(ctx, predt); } } // The function simply returns a uninitialized buffer as this is only used by the @@ -307,10 +307,10 @@ class NDCGCache : public RankingCache { public: NDCGCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p) : RankingCache{ctx, info, p} { - if (ctx->IsCPU()) { - this->InitOnCPU(ctx, info); - } else { + if (ctx->IsCUDA()) { this->InitOnCUDA(ctx, info); + } else { + this->InitOnCPU(ctx, info); } } @@ -318,7 +318,7 @@ class NDCGCache : public RankingCache { return inv_idcg_.View(ctx->Device()); } common::Span Discount(Context const* ctx) const { - return ctx->IsCPU() ? discounts_.ConstHostSpan() : discounts_.ConstDeviceSpan(); + return ctx->IsCUDA() ? 
discounts_.ConstDeviceSpan() : discounts_.ConstHostSpan(); } linalg::VectorView Dcg(Context const* ctx) { if (dcg_.Size() == 0) { @@ -387,10 +387,10 @@ class PreCache : public RankingCache { public: PreCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p) : RankingCache{ctx, info, p} { - if (ctx->IsCPU()) { - this->InitOnCPU(ctx, info); - } else { + if (ctx->IsCUDA()) { this->InitOnCUDA(ctx, info); + } else { + this->InitOnCPU(ctx, info); } } @@ -399,7 +399,7 @@ class PreCache : public RankingCache { pre_.SetDevice(ctx->Device()); pre_.Resize(this->Groups()); } - return ctx->IsCPU() ? pre_.HostSpan() : pre_.DeviceSpan(); + return ctx->IsCUDA() ? pre_.DeviceSpan() : pre_.HostSpan(); } }; @@ -418,10 +418,10 @@ class MAPCache : public RankingCache { public: MAPCache(Context const* ctx, MetaInfo const& info, LambdaRankParam const& p) : RankingCache{ctx, info, p}, n_samples_{static_cast(info.num_row_)} { - if (ctx->IsCPU()) { - this->InitOnCPU(ctx, info); - } else { + if (ctx->IsCUDA()) { this->InitOnCUDA(ctx, info); + } else { + this->InitOnCPU(ctx, info); } } @@ -430,21 +430,21 @@ class MAPCache : public RankingCache { n_rel_.SetDevice(ctx->Device()); n_rel_.Resize(n_samples_); } - return ctx->IsCPU() ? n_rel_.HostSpan() : n_rel_.DeviceSpan(); + return ctx->IsCUDA() ? n_rel_.DeviceSpan() : n_rel_.HostSpan(); } common::Span Acc(Context const* ctx) { if (acc_.Empty()) { acc_.SetDevice(ctx->Device()); acc_.Resize(n_samples_); } - return ctx->IsCPU() ? acc_.HostSpan() : acc_.DeviceSpan(); + return ctx->IsCUDA() ? acc_.DeviceSpan() : acc_.HostSpan(); } common::Span Map(Context const* ctx) { if (map_.Empty()) { map_.SetDevice(ctx->Device()); map_.Resize(this->Groups()); } - return ctx->IsCPU() ? map_.HostSpan() : map_.DeviceSpan(); + return ctx->IsCUDA() ? 
map_.DeviceSpan() : map_.HostSpan(); } }; diff --git a/src/common/ref_resource_view.h b/src/common/ref_resource_view.h index 0fadf846d..d4f82e615 100644 --- a/src/common/ref_resource_view.h +++ b/src/common/ref_resource_view.h @@ -76,7 +76,7 @@ class RefResourceView { [[nodiscard]] size_type size() const { return size_; } // NOLINT [[nodiscard]] size_type size_bytes() const { // NOLINT - return Span{data(), size()}.size_bytes(); + return Span{data(), size()}.size_bytes(); } [[nodiscard]] value_type* data() { return ptr_; }; // NOLINT [[nodiscard]] value_type const* data() const { return ptr_; }; // NOLINT diff --git a/src/common/stats.cc b/src/common/stats.cc index 03ee00b87..bbf969fcc 100644 --- a/src/common/stats.cc +++ b/src/common/stats.cc @@ -15,8 +15,7 @@ #include "xgboost/linalg.h" // Tensor, UnravelIndex, Apply #include "xgboost/logging.h" // CHECK_EQ -namespace xgboost { -namespace common { +namespace xgboost::common { void Median(Context const* ctx, linalg::Tensor const& t, HostDeviceVector const& weights, linalg::Tensor* out) { if (!ctx->IsCPU()) { @@ -46,11 +45,13 @@ void Median(Context const* ctx, linalg::Tensor const& t, } void Mean(Context const* ctx, linalg::Vector const& v, linalg::Vector* out) { - v.SetDevice(ctx->gpu_id); - out->SetDevice(ctx->gpu_id); + v.SetDevice(ctx->Device()); + out->SetDevice(ctx->Device()); out->Reshape(1); - if (ctx->IsCPU()) { + if (ctx->IsCUDA()) { + cuda_impl::Mean(ctx, v.View(ctx->Device()), out->View(ctx->Device())); + } else { auto h_v = v.HostView(); float n = v.Size(); MemStackAllocator tloc(ctx->Threads(), 0.0f); @@ -58,9 +59,6 @@ void Mean(Context const* ctx, linalg::Vector const& v, linalg::VectorHostView()(0) = ret; - } else { - cuda_impl::Mean(ctx, v.View(ctx->Device()), out->View(ctx->Device())); } } -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/src/common/stats.cu b/src/common/stats.cu index fbc19b8da..6cfcd6bae 100644 --- a/src/common/stats.cu +++ b/src/common/stats.cu @@ -15,19 +15,16 @@ #include "xgboost/host_device_vector.h" // HostDeviceVector #include "xgboost/linalg.h" // linalg::TensorView, UnravelIndex, Apply -namespace xgboost { -namespace common { -namespace cuda_impl { - #if defined(XGBOOST_USE_HIP) namespace cub = hipcub; #endif +namespace xgboost::common::cuda_impl { void Median(Context const* ctx, linalg::TensorView t, common::OptionalWeights weights, linalg::Tensor* out) { CHECK_GE(t.Shape(1), 1); HostDeviceVector segments(t.Shape(1) + 1, 0); - segments.SetDevice(ctx->gpu_id); + segments.SetDevice(ctx->Device()); auto d_segments = segments.DeviceSpan(); dh::LaunchN(d_segments.size(), ctx->CUDACtx()->Stream(), [=] XGBOOST_DEVICE(std::size_t i) { d_segments[i] = t.Shape(0) * i; }); @@ -36,7 +33,7 @@ void Median(Context const* ctx, linalg::TensorView t, return linalg::detail::Apply(t, linalg::UnravelIndex(i, t.Shape())); }); - out->SetDevice(ctx->gpu_id); + out->SetDevice(ctx->Device()); out->Reshape(t.Shape(1)); if (weights.Empty()) { common::SegmentedQuantile(ctx, 0.5, dh::tcbegin(d_segments), dh::tcend(d_segments), val_it, @@ -65,6 +62,4 @@ void Mean(Context const* ctx, linalg::VectorView v, linalg::VectorV dh::TemporaryArray temp{bytes}; cub::DeviceReduce::Sum(temp.data().get(), bytes, it, out.Values().data(), v.Size(), s); } -} // namespace cuda_impl -} // namespace common -} // namespace xgboost +} // namespace xgboost::common::cuda_impl diff --git a/src/common/stats.cuh b/src/common/stats.cuh index 16a22f877..d61adc41a 100644 --- a/src/common/stats.cuh +++ 
b/src/common/stats.cuh
@@ -160,7 +160,7 @@ void SegmentedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_begin, Se
   auto d_sorted_idx = dh::ToSpan(sorted_idx);
   auto val = thrust::make_permutation_iterator(val_begin, dh::tcbegin(d_sorted_idx));

-  quantiles->SetDevice(ctx->gpu_id);
+  quantiles->SetDevice(ctx->Device());
   quantiles->Resize(n_segments);
   auto d_results = quantiles->DeviceSpan();

@@ -226,7 +226,7 @@ void SegmentedWeightedQuantile(Context const* ctx, AlphaIt alpha_it, SegIt seg_b
 #endif
   auto n_segments = std::distance(seg_beg, seg_end) - 1;
-  quantiles->SetDevice(ctx->gpu_id);
+  quantiles->SetDevice(ctx->Device());
   quantiles->Resize(n_segments);
   auto d_results = quantiles->DeviceSpan();
   auto d_weight_cdf = dh::ToSpan(weights_cdf);
diff --git a/src/common/threading_utils.cc b/src/common/threading_utils.cc
index 349cc0ba7..5e730e96d 100644
--- a/src/common/threading_utils.cc
+++ b/src/common/threading_utils.cc
@@ -3,14 +3,23 @@
  */
 #include "threading_utils.h"

-#include <algorithm>
-#include <fstream>
+#include <algorithm>   // for max
+#include <exception>   // for exception
+#include <filesystem>  // for path, exists
+#include <fstream>     // for ifstream
+#include <string>      // for string

-#include "xgboost/logging.h"
+#include "common.h"  // for DivRoundUp

-namespace xgboost {
-namespace common {
-int32_t GetCfsCPUCount() noexcept {
+namespace xgboost::common {
+/**
+ * Modified from
+ * github.com/psiha/sweater/blob/master/include/boost/sweater/hardware_concurrency.hpp
+ *
+ * MIT License: Copyright (c) 2016 Domagoj Šarić
+ */
+std::int32_t GetCGroupV1Count(std::filesystem::path const& quota_path,
+                              std::filesystem::path const& period_path) {
 #if defined(__linux__)
   // https://bugs.openjdk.java.net/browse/JDK-8146115
   // http://hg.openjdk.java.net/jdk/hs/rev/7f22774a5f42
@@ -31,8 +40,8 @@ int32_t GetCfsCPUCount() noexcept {
     }
   };
   // complete fair scheduler from Linux
-  auto const cfs_quota(read_int("/sys/fs/cgroup/cpu/cpu.cfs_quota_us"));
-  auto const cfs_period(read_int("/sys/fs/cgroup/cpu/cpu.cfs_period_us"));
+  auto const cfs_quota(read_int(quota_path.c_str()));
+  auto const cfs_period(read_int(period_path.c_str()));
   if ((cfs_quota > 0) && (cfs_period > 0)) {
     return std::max(cfs_quota / cfs_period, 1);
   }
@@ -40,6 +49,47 @@
   return -1;
 }

+std::int32_t GetCGroupV2Count(std::filesystem::path const& bandwidth_path) noexcept(true) {
+  std::int32_t cnt{-1};
+#if defined(__linux__)
+  namespace fs = std::filesystem;
+
+  std::int32_t a{0}, b{0};
+
+  auto warn = [] { LOG(WARNING) << "Invalid cgroupv2 file."; };
+  try {
+    std::ifstream fin{bandwidth_path, std::ios::in};
+    fin >> a;
+    fin >> b;
+  } catch (std::exception const&) {
+    warn();
+    return cnt;
+  }
+  if (a > 0 && b > 0) {
+    cnt = std::max(common::DivRoundUp(a, b), 1);
+  }
+#endif  // defined(__linux__)
+  return cnt;
+}
+
+std::int32_t GetCfsCPUCount() noexcept {
+  namespace fs = std::filesystem;
+  fs::path const bandwidth_path{"/sys/fs/cgroup/cpu.max"};
+  auto has_v2 = fs::exists(bandwidth_path);
+  if (has_v2) {
+    return GetCGroupV2Count(bandwidth_path);
+  }
+
+  fs::path const quota_path{"/sys/fs/cgroup/cpu/cpu.cfs_quota_us"};
+  fs::path const period_path{"/sys/fs/cgroup/cpu/cpu.cfs_period_us"};
+  auto has_v1 = fs::exists(quota_path) && fs::exists(period_path);
+  if (has_v1) {
+    return GetCGroupV1Count(quota_path, period_path);
+  }
+
+  return -1;
+}
+
 std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
   // Don't use parallel if we are in a parallel region.
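GetCfsCPUCount above now probes cgroup v2 before falling back to the v1 cfs_quota_us/cfs_period_us pair: on the unified hierarchy, /sys/fs/cgroup/cpu.max contains "<quota> <period>" in microseconds, with the literal "max" in place of the quota when the controller imposes no limit. A self-contained sketch of the same parsing rule; CpuCountFromCpuMax is a hypothetical helper, and sample strings stand in for the real file:

#include <algorithm>
#include <iostream>
#include <sstream>
#include <string>

// Returns the CPU count implied by a cpu.max line, or -1 if unlimited/unreadable.
int CpuCountFromCpuMax(std::string const& line) {
  std::istringstream fin{line};
  int quota{0}, period{0};
  fin >> quota >> period;  // extraction fails on the literal "max", leaving quota == 0
  if (quota > 0 && period > 0) {
    return std::max((quota + period - 1) / period, 1);  // DivRoundUp, clamped to >= 1
  }
  return -1;
}

int main() {
  std::cout << CpuCountFromCpuMax("150000 100000") << '\n';  // 2 (1.5 CPUs, rounded up)
  std::cout << CpuCountFromCpuMax("max 100000") << '\n';     // -1 (no limit configured)
}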
if (omp_in_parallel()) { @@ -54,5 +104,4 @@ std::int32_t OmpGetNumThreads(std::int32_t n_threads) { n_threads = std::max(n_threads, 1); return n_threads; } -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/src/common/threading_utils.h b/src/common/threading_utils.h index 4ca4ca070..ac7119035 100644 --- a/src/common/threading_utils.h +++ b/src/common/threading_utils.h @@ -253,11 +253,6 @@ inline std::int32_t OmpGetThreadLimit() { * \brief Get thread limit from CFS. * * This function has non-trivial overhead and should not be called repeatly. - * - * Modified from - * github.com/psiha/sweater/blob/master/include/boost/sweater/hardware_concurrency.hpp - * - * MIT License: Copyright (c) 2016 Domagoj Å arić */ std::int32_t GetCfsCPUCount() noexcept; diff --git a/src/common/transform.h b/src/common/transform.h index fd6f82817..0457e26f3 100644 --- a/src/common/transform.h +++ b/src/common/transform.h @@ -62,8 +62,8 @@ class Transform { template struct Evaluator { public: - Evaluator(Functor func, Range range, int32_t n_threads, int32_t device_idx) - : func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device_idx} {} + Evaluator(Functor func, Range range, int32_t n_threads, DeviceOrd device) + : func_(func), range_{std::move(range)}, n_threads_{n_threads}, device_{device} {} /*! * \brief Evaluate the functor with input pointers to HostDeviceVector. @@ -73,7 +73,7 @@ class Transform { */ template void Eval(HDV... vectors) const { - bool on_device = device_ >= 0; + bool on_device = device_.IsCUDA(); if (on_device) { LaunchCUDA(func_, vectors...); @@ -118,11 +118,11 @@ class Transform { } // Recursive unpack for Shard. template - void UnpackShard(int device, const HostDeviceVector *vector) const { + void UnpackShard(DeviceOrd device, const HostDeviceVector *vector) const { vector->SetDevice(device); } template - void UnpackShard(int device, + void UnpackShard(DeviceOrd device, const HostDeviceVector *_vector, const HostDeviceVector *... _vectors) const { _vector->SetDevice(device); @@ -142,13 +142,7 @@ class Transform { // granularity is used in data vector. size_t shard_size = range_size; Range shard_range {0, static_cast(shard_size)}; - -#if defined(XGBOOST_USE_HIP) - dh::safe_cuda(hipSetDevice(device_)); -#elif defined(XGBOOST_USE_CUDA) - dh::safe_cuda(cudaSetDevice(device_)); -#endif - + dh::safe_cuda(cudaSetDevice(device_.ordinal)); const int kGrids = static_cast(DivRoundUp(*(range_.end()), kBlockThreads)); if (kGrids == 0) { @@ -182,7 +176,7 @@ class Transform { /*! \brief Range object specifying parallel threads index range. 
*/ Range range_; int32_t n_threads_; - int32_t device_; + DeviceOrd device_; }; public: @@ -200,8 +194,8 @@ class Transform { */ template static Evaluator Init(Functor func, Range const range, int32_t n_threads, - int32_t device_idx) { - return Evaluator{func, std::move(range), n_threads, device_idx}; + DeviceOrd device) { + return Evaluator{func, std::move(range), n_threads, device}; } }; diff --git a/src/context.cc b/src/context.cc index 1ce81026f..7b74a69e0 100644 --- a/src/context.cc +++ b/src/context.cc @@ -20,7 +20,6 @@ namespace xgboost { DMLC_REGISTER_PARAMETER(Context); -bst_d_ordinal_t constexpr Context::kCpuId; std::int64_t constexpr Context::kDefaultSeed; Context::Context() : cfs_cpu_count_{common::GetCfsCPUCount()} {} @@ -82,7 +81,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) { return std::nullopt; } - std::int32_t parsed_id{Context::kCpuId}; + std::int32_t parsed_id{DeviceOrd::CPUOrdinal()}; auto res = std::from_chars(ordinal.c_str(), ordinal.c_str() + ordinal.size(), parsed_id); if (res.ec != std::errc()) { return std::nullopt; @@ -119,7 +118,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) { auto split_it = std::find(s_device.cbegin(), s_device.cend(), ':'); DeviceOrd device; - device.ordinal = Context::InvalidOrdinal(); // mark it invalid for check. + device.ordinal = DeviceOrd::InvalidOrdinal(); // mark it invalid for check. if (split_it == s_device.cend()) { // no ordinal. if (s_device == DeviceSym::CPU()) { @@ -147,7 +146,7 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) { device = DeviceOrd::CUDA(opt_id.value()); } - if (device.ordinal < Context::kCpuId) { + if (device.ordinal < DeviceOrd::CPUOrdinal()) { fatal(); } device = CUDAOrdinal(device, fail_on_invalid_gpu_id); @@ -156,6 +155,28 @@ DeviceOrd CUDAOrdinal(DeviceOrd device, bool) { } } // namespace +std::ostream& operator<<(std::ostream& os, DeviceOrd ord) { + os << ord.Name(); + return os; +} + +void Context::Init(Args const& kwargs) { + auto unknown = this->UpdateAllowUnknown(kwargs); + if (!unknown.empty()) { + std::stringstream ss; + std::size_t i = 0; + ss << "[Internal Error] Unknown parameters passed to the Context {"; + for (auto const& [k, _] : unknown) { + ss << '"' << k << '"'; + if (++i != unknown.size()) { + ss << ", "; + } + } + ss << "}\n"; + LOG(FATAL) << ss.str(); + } +} + void Context::ConfigureGpuId(bool require_gpu) { if (this->IsCPU() && require_gpu) { this->UpdateAllowUnknown(Args{{kDevice, DeviceSym::CUDA()}}); @@ -178,7 +199,7 @@ void Context::SetDeviceOrdinal(Args const& kwargs) { error::WarnDeprecatedGPUId(); auto opt_id = ParseInt(StringView{gpu_id_it->second}); CHECK(opt_id.has_value()) << "Invalid value for `gpu_id`. 
Got:" << gpu_id_it->second; - if (opt_id.value() > Context::kCpuId) { + if (opt_id.value() > DeviceOrd::CPUOrdinal()) { this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CUDA(opt_id.value()).Name()}}); } else { this->UpdateAllowUnknown(Args{{kDevice, DeviceOrd::CPU().Name()}}); @@ -194,9 +215,9 @@ void Context::SetDeviceOrdinal(Args const& kwargs) { this->SetDevice(new_d); if (this->IsCPU()) { - CHECK_EQ(this->device_.ordinal, kCpuId); + CHECK_EQ(this->device_.ordinal, DeviceOrd::CPUOrdinal()); } else { - CHECK_GT(this->device_.ordinal, kCpuId); + CHECK_GT(this->device_.ordinal, DeviceOrd::CPUOrdinal()); } } diff --git a/src/data/adapter.cc b/src/data/adapter.cc new file mode 100644 index 000000000..4fa171c9d --- /dev/null +++ b/src/data/adapter.cc @@ -0,0 +1,28 @@ +/** + * Copyright 2019-2023, XGBoost Contributors + */ +#include "adapter.h" + +#include "../c_api/c_api_error.h" // for API_BEGIN, API_END +#include "xgboost/c_api.h" + +namespace xgboost::data { +template +bool IteratorAdapter::Next() { + if ((*next_callback_)( + data_handle_, + [](void *handle, XGBoostBatchCSR batch) -> int { + API_BEGIN(); + static_cast(handle)->SetData(batch); + API_END(); + }, + this) != 0) { + at_first_ = false; + return true; + } else { + return false; + } +} + +template class IteratorAdapter; +} // namespace xgboost::data diff --git a/src/data/adapter.h b/src/data/adapter.h index 1463a13a7..9e7058aba 100644 --- a/src/data/adapter.h +++ b/src/data/adapter.h @@ -1,5 +1,5 @@ -/*! - * Copyright (c) 2019~2021 by Contributors +/** + * Copyright 2019-2023, XGBoost Contributors * \file adapter.h */ #ifndef XGBOOST_DATA_ADAPTER_H_ @@ -16,11 +16,9 @@ #include // std::move #include -#include "../c_api/c_api_error.h" #include "../common/error_msg.h" // for MaxFeatureSize #include "../common/math.h" #include "array_interface.h" -#include "arrow-cdi.h" #include "xgboost/base.h" #include "xgboost/data.h" #include "xgboost/logging.h" @@ -743,8 +741,10 @@ class FileAdapter : dmlc::DataIter { dmlc::Parser* parser_; }; -/*! \brief Data iterator that takes callback to return data, used in JVM package for - * accepting data iterator. */ +/** + * @brief Data iterator that takes callback to return data, used in JVM package for accepting data + * iterator. 
+ */ template class IteratorAdapter : public dmlc::DataIter { public: @@ -758,23 +758,9 @@ class IteratorAdapter : public dmlc::DataIter { CHECK(at_first_) << "Cannot reset IteratorAdapter"; } - bool Next() override { - if ((*next_callback_)( - data_handle_, - [](void *handle, XGBoostBatchCSR batch) -> int { - API_BEGIN(); - static_cast(handle)->SetData(batch); - API_END(); - }, - this) != 0) { - at_first_ = false; - return true; - } else { - return false; - } - } + [[nodiscard]] bool Next() override; - FileAdapterBatch const& Value() const override { + [[nodiscard]] FileAdapterBatch const& Value() const override { return *batch_.get(); } @@ -822,12 +808,12 @@ class IteratorAdapter : public dmlc::DataIter { block_.index = dmlc::BeginPtr(index_); block_.value = dmlc::BeginPtr(value_); - batch_.reset(new FileAdapterBatch(&block_, row_offset_)); + batch_ = std::make_unique(&block_, row_offset_); row_offset_ += offset_.size() - 1; } - size_t NumColumns() const { return columns_; } - size_t NumRows() const { return kAdapterUnknownSize; } + [[nodiscard]] std::size_t NumColumns() const { return columns_; } + [[nodiscard]] std::size_t NumRows() const { return kAdapterUnknownSize; } private: std::vector offset_; @@ -849,356 +835,6 @@ class IteratorAdapter : public dmlc::DataIter { std::unique_ptr batch_; }; -enum ColumnDType : uint8_t { - kUnknown, - kInt8, - kUInt8, - kInt16, - kUInt16, - kInt32, - kUInt32, - kInt64, - kUInt64, - kFloat, - kDouble -}; - -class Column { - public: - Column() = default; - - Column(size_t col_idx, size_t length, size_t null_count, const uint8_t* bitmap) - : col_idx_{col_idx}, length_{length}, null_count_{null_count}, bitmap_{bitmap} {} - - virtual ~Column() = default; - - Column(const Column&) = delete; - Column& operator=(const Column&) = delete; - Column(Column&&) = delete; - Column& operator=(Column&&) = delete; - - // whether the valid bit is set for this element - bool IsValid(size_t row_idx) const { - return (!bitmap_ || (bitmap_[row_idx/8] & (1 << (row_idx%8)))); - } - - virtual COOTuple GetElement(size_t row_idx) const = 0; - - virtual bool IsValidElement(size_t row_idx) const = 0; - - virtual std::vector AsFloatVector() const = 0; - - virtual std::vector AsUint64Vector() const = 0; - - size_t Length() const { return length_; } - - protected: - size_t col_idx_; - size_t length_; - size_t null_count_; - const uint8_t* bitmap_; -}; - -// Only columns of primitive types are supported. An ArrowColumnarBatch is a -// collection of std::shared_ptr. These columns can be of different data types. -// Hence, PrimitiveColumn is a class template; and all concrete PrimitiveColumns -// derive from the abstract class Column. -template -class PrimitiveColumn : public Column { - static constexpr float kNaN = std::numeric_limits::quiet_NaN(); - - public: - PrimitiveColumn(size_t idx, size_t length, size_t null_count, - const uint8_t* bitmap, const T* data, float missing) - : Column{idx, length, null_count, bitmap}, data_{data}, missing_{missing} {} - - COOTuple GetElement(size_t row_idx) const override { - CHECK(data_ && row_idx < length_) << "Column is empty or out-of-bound index of the column"; - return { row_idx, col_idx_, IsValidElement(row_idx) ? 
- static_cast(data_[row_idx]) : kNaN }; - } - - bool IsValidElement(size_t row_idx) const override { - // std::isfinite needs to cast to double to prevent msvc report error - return IsValid(row_idx) - && std::isfinite(static_cast(data_[row_idx])) - && static_cast(data_[row_idx]) != missing_; - } - - std::vector AsFloatVector() const override { - CHECK(data_) << "Column is empty"; - std::vector fv(length_); - std::transform(data_, data_ + length_, fv.begin(), - [](T v) { return static_cast(v); }); - return fv; - } - - std::vector AsUint64Vector() const override { - CHECK(data_) << "Column is empty"; - std::vector iv(length_); - std::transform(data_, data_ + length_, iv.begin(), - [](T v) { return static_cast(v); }); - return iv; - } - - private: - const T* data_; - float missing_; // user specified missing value -}; - -struct ColumnarMetaInfo { - // data type of the column - ColumnDType type{ColumnDType::kUnknown}; - // location of the column in an Arrow record batch - int64_t loc{-1}; -}; - -struct ArrowSchemaImporter { - std::vector columns; - - // map Arrow format strings to types - static ColumnDType FormatMap(char const* format_str) { - CHECK(format_str) << "Format string cannot be empty"; - switch (format_str[0]) { - case 'c': - return ColumnDType::kInt8; - case 'C': - return ColumnDType::kUInt8; - case 's': - return ColumnDType::kInt16; - case 'S': - return ColumnDType::kUInt16; - case 'i': - return ColumnDType::kInt32; - case 'I': - return ColumnDType::kUInt32; - case 'l': - return ColumnDType::kInt64; - case 'L': - return ColumnDType::kUInt64; - case 'f': - return ColumnDType::kFloat; - case 'g': - return ColumnDType::kDouble; - default: - CHECK(false) << "Column data type not supported by XGBoost"; - return ColumnDType::kUnknown; - } - } - - void Import(struct ArrowSchema *schema) { - if (schema) { - CHECK(std::string(schema->format) == "+s"); // NOLINT - CHECK(columns.empty()); - for (auto i = 0; i < schema->n_children; ++i) { - std::string name{schema->children[i]->name}; - ColumnDType type = FormatMap(schema->children[i]->format); - ColumnarMetaInfo col_info{type, i}; - columns.push_back(col_info); - } - if (schema->release) { - schema->release(schema); - } - } - } -}; - -class ArrowColumnarBatch { - public: - ArrowColumnarBatch(struct ArrowArray *rb, struct ArrowSchemaImporter* schema) - : rb_{rb}, schema_{schema} { - CHECK(rb_) << "Cannot import non-existent record batch"; - CHECK(!schema_->columns.empty()) << "Cannot import record batch without a schema"; - } - - size_t Import(float missing) { - auto& infov = schema_->columns; - for (size_t i = 0; i < infov.size(); ++i) { - columns_.push_back(CreateColumn(i, infov[i], missing)); - } - - // Compute the starting location for every row in this batch - auto batch_size = rb_->length; - auto num_columns = columns_.size(); - row_offsets_.resize(batch_size + 1, 0); - for (auto i = 0; i < batch_size; ++i) { - row_offsets_[i+1] = row_offsets_[i]; - for (size_t j = 0; j < num_columns; ++j) { - if (GetColumn(j).IsValidElement(i)) { - row_offsets_[i+1]++; - } - } - } - // return number of elements in the batch - return row_offsets_.back(); - } - - ArrowColumnarBatch(const ArrowColumnarBatch&) = delete; - ArrowColumnarBatch& operator=(const ArrowColumnarBatch&) = delete; - ArrowColumnarBatch(ArrowColumnarBatch&&) = delete; - ArrowColumnarBatch& operator=(ArrowColumnarBatch&&) = delete; - - virtual ~ArrowColumnarBatch() { - if (rb_ && rb_->release) { - rb_->release(rb_); - rb_ = nullptr; - } - columns_.clear(); - } - - size_t Size() const 
{ return rb_ ? rb_->length : 0; } - - size_t NumColumns() const { return columns_.size(); } - - size_t NumElements() const { return row_offsets_.back(); } - - const Column& GetColumn(size_t col_idx) const { - return *columns_[col_idx]; - } - - void ShiftRowOffsets(size_t batch_offset) { - std::transform(row_offsets_.begin(), row_offsets_.end(), row_offsets_.begin(), - [=](size_t c) { return c + batch_offset; }); - } - - const std::vector& RowOffsets() const { return row_offsets_; } - - private: - std::shared_ptr CreateColumn(size_t idx, - ColumnarMetaInfo info, - float missing) const { - if (info.loc < 0) { - return nullptr; - } - - auto loc_in_batch = info.loc; - auto length = rb_->length; - auto null_count = rb_->null_count; - auto buffers0 = rb_->children[loc_in_batch]->buffers[0]; - auto buffers1 = rb_->children[loc_in_batch]->buffers[1]; - const uint8_t* bitmap = buffers0 ? reinterpret_cast(buffers0) : nullptr; - const uint8_t* data = buffers1 ? reinterpret_cast(buffers1) : nullptr; - - // if null_count is not computed, compute it here - if (null_count < 0) { - if (!bitmap) { - null_count = 0; - } else { - null_count = length; - for (auto i = 0; i < length; ++i) { - if (bitmap[i/8] & (1 << (i%8))) { - null_count--; - } - } - } - } - - switch (info.type) { - case ColumnDType::kInt8: - return std::make_shared>( - idx, length, null_count, bitmap, - reinterpret_cast(data), missing); - case ColumnDType::kUInt8: - return std::make_shared>( - idx, length, null_count, bitmap, data, missing); - case ColumnDType::kInt16: - return std::make_shared>( - idx, length, null_count, bitmap, - reinterpret_cast(data), missing); - case ColumnDType::kUInt16: - return std::make_shared>( - idx, length, null_count, bitmap, - reinterpret_cast(data), missing); - case ColumnDType::kInt32: - return std::make_shared>( - idx, length, null_count, bitmap, - reinterpret_cast(data), missing); - case ColumnDType::kUInt32: - return std::make_shared>( - idx, length, null_count, bitmap, - reinterpret_cast(data), missing); - case ColumnDType::kInt64: - return std::make_shared>( - idx, length, null_count, bitmap, - reinterpret_cast(data), missing); - case ColumnDType::kUInt64: - return std::make_shared>( - idx, length, null_count, bitmap, - reinterpret_cast(data), missing); - case ColumnDType::kFloat: - return std::make_shared>( - idx, length, null_count, bitmap, - reinterpret_cast(data), missing); - case ColumnDType::kDouble: - return std::make_shared>( - idx, length, null_count, bitmap, - reinterpret_cast(data), missing); - default: - return nullptr; - } - } - - struct ArrowArray* rb_; - struct ArrowSchemaImporter* schema_; - std::vector> columns_; - std::vector row_offsets_; -}; - -using ArrowColumnarBatchVec = std::vector>; -class RecordBatchesIterAdapter: public dmlc::DataIter { - public: - RecordBatchesIterAdapter(XGDMatrixCallbackNext* next_callback, int nbatch) - : next_callback_{next_callback}, nbatches_{nbatch} {} - - void BeforeFirst() override { - CHECK(at_first_) << "Cannot reset RecordBatchesIterAdapter"; - } - - bool Next() override { - batches_.clear(); - while (batches_.size() < static_cast(nbatches_) && (*next_callback_)(this) != 0) { - at_first_ = false; - } - - if (batches_.size() > 0) { - return true; - } else { - return false; - } - } - - void SetData(struct ArrowArray* rb, struct ArrowSchema* schema) { - // Schema is only imported once at the beginning, regardless how many - // baches are comming. - // But even schema is not imported we still need to release its C data - // exported from Arrow. 
- if (at_first_ && schema) { - schema_.Import(schema); - } else { - if (schema && schema->release) { - schema->release(schema); - } - } - if (rb) { - batches_.push_back(std::make_unique(rb, &schema_)); - } - } - - const ArrowColumnarBatchVec& Value() const override { - return batches_; - } - - size_t NumColumns() const { return schema_.columns.size(); } - size_t NumRows() const { return kAdapterUnknownSize; } - - private: - XGDMatrixCallbackNext *next_callback_; - bool at_first_{true}; - int nbatches_; - struct ArrowSchemaImporter schema_; - ArrowColumnarBatchVec batches_; -}; - class SparsePageAdapterBatch { HostSparsePageView page_; diff --git a/src/data/array_interface.h b/src/data/array_interface.h index 55e245eb1..53dbc37a1 100644 --- a/src/data/array_interface.h +++ b/src/data/array_interface.h @@ -16,7 +16,7 @@ #include #include -#include "../common/bitfield.h" +#include "../common/bitfield.h" // for RBitField8 #include "../common/common.h" #include "../common/error_msg.h" // for NoF128 #include "xgboost/base.h" @@ -106,7 +106,20 @@ struct ArrayInterfaceErrors { */ class ArrayInterfaceHandler { public: - enum Type : std::int8_t { kF2, kF4, kF8, kF16, kI1, kI2, kI4, kI8, kU1, kU2, kU4, kU8 }; + enum Type : std::int8_t { + kF2 = 0, + kF4 = 1, + kF8 = 2, + kF16 = 3, + kI1 = 4, + kI2 = 5, + kI4 = 6, + kI8 = 7, + kU1 = 8, + kU2 = 9, + kU4 = 10, + kU8 = 11, + }; template static PtrType GetPtrFromArrayData(Object::Map const &obj) { @@ -589,6 +602,57 @@ class ArrayInterface { ArrayInterfaceHandler::Type type{ArrayInterfaceHandler::kF16}; }; +template +auto DispatchDType(ArrayInterfaceHandler::Type dtype, Fn dispatch) { + switch (dtype) { + case ArrayInterfaceHandler::kF2: { +#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__) + return dispatch(__half{}); +#else + LOG(FATAL) << "half type is only supported for CUDA input."; + break; +#endif + } + case ArrayInterfaceHandler::kF4: { + return dispatch(float{}); + } + case ArrayInterfaceHandler::kF8: { + return dispatch(double{}); + } + case ArrayInterfaceHandler::kF16: { + using T = long double; + CHECK(sizeof(T) == 16) << error::NoF128(); + return dispatch(T{}); + } + case ArrayInterfaceHandler::kI1: { + return dispatch(std::int8_t{}); + } + case ArrayInterfaceHandler::kI2: { + return dispatch(std::int16_t{}); + } + case ArrayInterfaceHandler::kI4: { + return dispatch(std::int32_t{}); + } + case ArrayInterfaceHandler::kI8: { + return dispatch(std::int64_t{}); + } + case ArrayInterfaceHandler::kU1: { + return dispatch(std::uint8_t{}); + } + case ArrayInterfaceHandler::kU2: { + return dispatch(std::uint16_t{}); + } + case ArrayInterfaceHandler::kU4: { + return dispatch(std::uint32_t{}); + } + case ArrayInterfaceHandler::kU8: { + return dispatch(std::uint64_t{}); + } + } + + return std::result_of_t(); +} + template void DispatchDType(ArrayInterface const array, DeviceOrd device, Fn fn) { // Only used for cuDF at the moment. 
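The new value-returning DispatchDType overload above maps a runtime dtype tag to a compile-time type by passing the callback a default-constructed value of the selected type, so the callback can recover the type with decltype and return a result through the switch. A trimmed standalone analogue, assuming a three-member enum in place of the real twelve-entry one:

#include <cstdint>
#include <iostream>

enum class Type : std::int8_t { kF4, kF8, kI4 };

// Trimmed-down analogue of DispatchDType: hand the generic lambda a
// default-constructed value whose static type encodes the runtime choice.
template <typename Fn>
auto DispatchDType(Type t, Fn dispatch) {
  switch (t) {
    case Type::kF4: return dispatch(float{});
    case Type::kF8: return dispatch(double{});
    case Type::kI4: return dispatch(std::int32_t{});
  }
  return dispatch(float{});  // unreachable; keeps every path returning a value
}

int main() {
  auto size = DispatchDType(Type::kF8, [](auto t) {
    using T = decltype(t);  // the concrete element type selected at runtime
    return sizeof(T);
  });
  std::cout << size << '\n';  // 8
}

Because every case returns through the same generic lambda, the deduced return type must agree across all instantiations, which is why the fallback return at the end mirrors one of the cases.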
@@ -604,60 +668,7 @@ void DispatchDType(ArrayInterface const array, DeviceOrd device, Fn fn) { std::numeric_limits::max()}, array.shape, array.strides, device}); }; - switch (array.type) { - case ArrayInterfaceHandler::kF2: { -#if defined(XGBOOST_USE_CUDA) || defined(__HIP_PLATFORM_AMD__) - dispatch(__half{}); -#endif - break; - } - case ArrayInterfaceHandler::kF4: { - dispatch(float{}); - break; - } - case ArrayInterfaceHandler::kF8: { - dispatch(double{}); - break; - } - case ArrayInterfaceHandler::kF16: { - using T = long double; - CHECK(sizeof(long double) == 16) << error::NoF128(); - dispatch(T{}); - break; - } - case ArrayInterfaceHandler::kI1: { - dispatch(std::int8_t{}); - break; - } - case ArrayInterfaceHandler::kI2: { - dispatch(std::int16_t{}); - break; - } - case ArrayInterfaceHandler::kI4: { - dispatch(std::int32_t{}); - break; - } - case ArrayInterfaceHandler::kI8: { - dispatch(std::int64_t{}); - break; - } - case ArrayInterfaceHandler::kU1: { - dispatch(std::uint8_t{}); - break; - } - case ArrayInterfaceHandler::kU2: { - dispatch(std::uint16_t{}); - break; - } - case ArrayInterfaceHandler::kU4: { - dispatch(std::uint32_t{}); - break; - } - case ArrayInterfaceHandler::kU8: { - dispatch(std::uint64_t{}); - break; - } - } + DispatchDType(array.type, dispatch); } /** diff --git a/src/data/arrow-cdi.h b/src/data/arrow-cdi.h deleted file mode 100644 index 2cb061b3a..000000000 --- a/src/data/arrow-cdi.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -#pragma once - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define ARROW_FLAG_DICTIONARY_ORDERED 1 -#define ARROW_FLAG_NULLABLE 2 -#define ARROW_FLAG_MAP_KEYS_SORTED 4 - -struct ArrowSchema { - // Array type description - const char* format; - const char* name; - const char* metadata; - int64_t flags; - int64_t n_children; - struct ArrowSchema** children; - struct ArrowSchema* dictionary; - - // Release callback - void (*release)(struct ArrowSchema*); - // Opaque producer-specific data - void* private_data; -}; - -struct ArrowArray { - // Array data description - int64_t length; - int64_t null_count; - int64_t offset; - int64_t n_buffers; - int64_t n_children; - const void** buffers; - struct ArrowArray** children; - struct ArrowArray* dictionary; - - // Release callback - void (*release)(struct ArrowArray*); - // Opaque producer-specific data - void* private_data; -}; - -#ifdef __cplusplus -} -#endif diff --git a/src/data/data.cc b/src/data/data.cc index 7efa07174..b466bcfef 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -635,22 +635,39 @@ void MetaInfo::GetInfo(char const* key, bst_ulong* out_len, DataType dtype, } void MetaInfo::SetFeatureInfo(const char* key, const char **info, const bst_ulong size) { - if (size != 0 && this->num_col_ != 0) { + if (size != 0 && this->num_col_ != 0 && !IsColumnSplit()) { CHECK_EQ(size, this->num_col_) << "Length of " << key << " must be equal to number of columns."; CHECK(info); } if (!std::strcmp(key, "feature_type")) { feature_type_names.clear(); - auto& h_feature_types = feature_types.HostVector(); for (size_t i = 0; i < size; ++i) { auto elem = info[i]; feature_type_names.emplace_back(elem); } + if (IsColumnSplit()) { + feature_type_names = collective::AllgatherStrings(feature_type_names); + CHECK_EQ(feature_type_names.size(), num_col_) + << "Length of " << key << " must be equal to number of columns."; + } + auto& h_feature_types = feature_types.HostVector(); LoadFeatureType(feature_type_names, &h_feature_types); } else if (!std::strcmp(key, "feature_name")) { - feature_names.clear(); - for (size_t i = 0; i < size; ++i) { - feature_names.emplace_back(info[i]); + if (IsColumnSplit()) { + std::vector local_feature_names{}; + auto const rank = collective::GetRank(); + for (std::size_t i = 0; i < size; ++i) { + auto elem = std::to_string(rank) + "." 
+ info[i]; + local_feature_names.emplace_back(elem); + } + feature_names = collective::AllgatherStrings(local_feature_names); + CHECK_EQ(feature_names.size(), num_col_) + << "Length of " << key << " must be equal to number of columns."; + } else { + feature_names.clear(); + for (size_t i = 0; i < size; ++i) { + feature_names.emplace_back(info[i]); + } } } else { LOG(FATAL) << "Unknown feature info name: " << key; @@ -687,13 +704,13 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col linalg::Stack(&this->labels, that.labels); - this->weights_.SetDevice(that.weights_.DeviceIdx()); + this->weights_.SetDevice(that.weights_.Device()); this->weights_.Extend(that.weights_); - this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.DeviceIdx()); + this->labels_lower_bound_.SetDevice(that.labels_lower_bound_.Device()); this->labels_lower_bound_.Extend(that.labels_lower_bound_); - this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.DeviceIdx()); + this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.Device()); this->labels_upper_bound_.Extend(that.labels_upper_bound_); linalg::Stack(&this->base_margin_, that.base_margin_); @@ -723,13 +740,13 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col } if (!that.feature_weights.Empty()) { this->feature_weights.Resize(that.feature_weights.Size()); - this->feature_weights.SetDevice(that.feature_weights.DeviceIdx()); + this->feature_weights.SetDevice(that.feature_weights.Device()); this->feature_weights.Copy(that.feature_weights); } } void MetaInfo::SynchronizeNumberOfColumns() { - if (IsVerticalFederated()) { + if (IsColumnSplit()) { collective::Allreduce(&num_col_, 1); } else { collective::Allreduce(&num_col_, 1); @@ -738,22 +755,22 @@ void MetaInfo::SynchronizeNumberOfColumns() { namespace { template -void CheckDevice(std::int32_t device, HostDeviceVector const& v) { - bool valid = v.Device().IsCPU() || device == Context::kCpuId || v.DeviceIdx() == device; +void CheckDevice(DeviceOrd device, HostDeviceVector const& v) { + bool valid = v.Device().IsCPU() || device.IsCPU() || v.Device() == device; if (!valid) { LOG(FATAL) << "Invalid device ordinal. Data is associated with a different device ordinal than " "the booster. 
The device ordinal of the data is: " - << v.DeviceIdx() << "; the device ordinal of the Booster is: " << device; + << v.Device() << "; the device ordinal of the Booster is: " << device; } } template -void CheckDevice(std::int32_t device, linalg::Tensor const& v) { +void CheckDevice(DeviceOrd device, linalg::Tensor const& v) { CheckDevice(device, *v.Data()); } } // anonymous namespace -void MetaInfo::Validate(std::int32_t device) const { +void MetaInfo::Validate(DeviceOrd device) const { if (group_ptr_.size() != 0 && weights_.Size() != 0) { CHECK_EQ(group_ptr_.size(), weights_.Size() + 1) << error::GroupWeight(); return; @@ -850,14 +867,6 @@ DMatrix* TryLoadBinary(std::string fname, bool silent) { } // namespace DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_split_mode) { - auto need_split = false; - if (collective::IsFederated()) { - LOG(CONSOLE) << "XGBoost federated mode detected, not splitting data among workers"; - } else if (collective::IsDistributed()) { - LOG(CONSOLE) << "XGBoost distributed mode detected, will split data among workers"; - need_split = true; - } - std::string fname, cache_file; auto dlm_pos = uri.find('#'); if (dlm_pos != std::string::npos) { @@ -865,24 +874,6 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s fname = uri.substr(0, dlm_pos); CHECK_EQ(cache_file.find('#'), std::string::npos) << "Only one `#` is allowed in file path for cache file specification."; - if (need_split && data_split_mode == DataSplitMode::kRow) { - std::ostringstream os; - std::vector cache_shards = common::Split(cache_file, ':'); - for (size_t i = 0; i < cache_shards.size(); ++i) { - size_t pos = cache_shards[i].rfind('.'); - if (pos == std::string::npos) { - os << cache_shards[i] << ".r" << collective::GetRank() << "-" - << collective::GetWorldSize(); - } else { - os << cache_shards[i].substr(0, pos) << ".r" << collective::GetRank() << "-" - << collective::GetWorldSize() << cache_shards[i].substr(pos, cache_shards[i].length()); - } - if (i + 1 != cache_shards.size()) { - os << ':'; - } - } - cache_file = os.str(); - } } else { fname = uri; } @@ -894,19 +885,7 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s } int partid = 0, npart = 1; - if (need_split && data_split_mode == DataSplitMode::kRow) { - partid = collective::GetRank(); - npart = collective::GetWorldSize(); - } else { - // test option to load in part - npart = 1; - } - - if (npart != 1) { - LOG(CONSOLE) << "Load part of data " << partid << " of " << npart << " parts"; - } - - DMatrix* dmat{nullptr}; + DMatrix* dmat{}; if (cache_file.empty()) { fname = data::ValidateFileFormat(fname); @@ -916,6 +895,8 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s dmat = DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), Context{}.Threads(), cache_file, data_split_mode); } else { + CHECK(data_split_mode != DataSplitMode::kCol) + << "Column-wise data split is not supported for external memory."; data::FileIterator iter{fname, static_cast(partid), static_cast(npart)}; dmat = new data::SparsePageDMatrix{&iter, iter.Proxy(), @@ -926,17 +907,7 @@ DMatrix* DMatrix::Load(const std::string& uri, bool silent, DataSplitMode data_s cache_file}; } - if (need_split && data_split_mode == DataSplitMode::kCol) { - if (!cache_file.empty()) { - LOG(FATAL) << "Column-wise data split is not support for external memory."; - } - LOG(CONSOLE) << "Splitting data by column"; - auto* sliced = dmat->SliceCol(npart, partid); - 
delete dmat; - return sliced; - } else { - return dmat; - } + return dmat; } template (data::CSCArrayAdapter* template DMatrix* DMatrix::Create( data::IteratorAdapter* adapter, float missing, int nthread, const std::string& cache_prefix, DataSplitMode data_split_mode); -template DMatrix* DMatrix::Create( - data::RecordBatchesIterAdapter* adapter, float missing, int nthread, const std::string&, - DataSplitMode data_split_mode); SparsePage SparsePage::GetTranspose(int num_columns, int32_t n_threads) const { SparsePage transpose; diff --git a/src/data/data.cu b/src/data/data.cu index b1b75f5e6..9c0c02b24 100644 --- a/src/data/data.cu +++ b/src/data/data.cu @@ -33,13 +33,13 @@ template void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tensor* p_out) { ArrayInterface array(arr_interface); if (array.n == 0) { - p_out->SetDevice(0); + p_out->SetDevice(DeviceOrd::CUDA(0)); p_out->Reshape(array.shape); return; } CHECK_EQ(array.valid.Capacity(), 0) << "Meta info like label or weight can not have missing value."; - auto ptr_device = SetDeviceToPtr(array.data); + auto ptr_device = DeviceOrd::CUDA(SetDeviceToPtr(array.data)); p_out->SetDevice(ptr_device); if (array.is_contiguous && array.type == ToDType::kType) { @@ -55,7 +55,7 @@ void CopyTensorInfoImpl(CUDAContext const* ctx, Json arr_interface, linalg::Tens return; } p_out->Reshape(array.shape); - auto t = p_out->View(DeviceOrd::CUDA(ptr_device)); + auto t = p_out->View(ptr_device); linalg::ElementWiseTransformDevice( t, [=] __device__(size_t i, T) { @@ -91,7 +91,7 @@ void CopyQidImpl(ArrayInterface<1> array_interface, std::vector* p_ }); dh::caching_device_vector flag(1); auto d_flag = dh::ToSpan(flag); - auto d = SetDeviceToPtr(array_interface.data); + auto d = DeviceOrd::CUDA(SetDeviceToPtr(array_interface.data)); dh::LaunchN(1, [=] __device__(size_t) { d_flag[0] = true; }); dh::LaunchN(array_interface.Shape(0) - 1, [=] __device__(size_t i) { auto typed = TypedIndex{array_interface}; diff --git a/src/data/device_adapter.cuh b/src/data/device_adapter.cuh index 7b907f7e2..ac19d47e4 100644 --- a/src/data/device_adapter.cuh +++ b/src/data/device_adapter.cuh @@ -28,8 +28,8 @@ class CudfAdapterBatch : public detail::NoMetaInfo { CudfAdapterBatch(common::Span> columns, size_t num_rows) : columns_(columns), num_rows_(num_rows) {} - size_t Size() const { return num_rows_ * columns_.size(); } - __device__ __forceinline__ COOTuple GetElement(size_t idx) const { + [[nodiscard]] std::size_t Size() const { return num_rows_ * columns_.size(); } + [[nodiscard]] __device__ __forceinline__ COOTuple GetElement(size_t idx) const { size_t column_idx = idx % columns_.size(); size_t row_idx = idx / columns_.size(); auto const& column = columns_[column_idx]; @@ -39,7 +39,7 @@ class CudfAdapterBatch : public detail::NoMetaInfo { return {row_idx, column_idx, value}; } - __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const { + [[nodiscard]] __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const { auto const& column = columns_[fidx]; float value = column.valid.Data() == nullptr || column.valid.Check(ridx) ? 
column(ridx) @@ -47,8 +47,8 @@ class CudfAdapterBatch : public detail::NoMetaInfo { return value; } - XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; } - XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); } + [[nodiscard]] XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; } + [[nodiscard]] XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); } private: common::Span> columns_; @@ -120,16 +120,14 @@ class CudfAdapter : public detail::SingleBatchDataIter { return; } - device_idx_ = dh::CudaGetPointerDevice(first_column.data); - CHECK_NE(device_idx_, Context::kCpuId); - - dh::safe_cuda(cudaSetDevice(device_idx_)); - + device_ = DeviceOrd::CUDA(dh::CudaGetPointerDevice(first_column.data)); + CHECK(device_.IsCUDA()); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); for (auto& json_col : json_columns) { auto column = ArrayInterface<1>(get(json_col)); columns.push_back(column); num_rows_ = std::max(num_rows_, column.Shape(0)); - CHECK_EQ(device_idx_, dh::CudaGetPointerDevice(column.data)) + CHECK_EQ(device_.ordinal, dh::CudaGetPointerDevice(column.data)) << "All columns should use the same device."; CHECK_EQ(num_rows_, column.Shape(0)) << "All columns should have same number of rows."; @@ -145,15 +143,15 @@ class CudfAdapter : public detail::SingleBatchDataIter { return batch_; } - size_t NumRows() const { return num_rows_; } - size_t NumColumns() const { return columns_.size(); } - int32_t DeviceIdx() const { return device_idx_; } + [[nodiscard]] std::size_t NumRows() const { return num_rows_; } + [[nodiscard]] std::size_t NumColumns() const { return columns_.size(); } + [[nodiscard]] DeviceOrd Device() const { return device_; } private: CudfAdapterBatch batch_; dh::device_vector> columns_; size_t num_rows_{0}; - int32_t device_idx_{Context::kCpuId}; + DeviceOrd device_{DeviceOrd::CPU()}; }; class CupyAdapterBatch : public detail::NoMetaInfo { @@ -161,22 +159,22 @@ class CupyAdapterBatch : public detail::NoMetaInfo { CupyAdapterBatch() = default; explicit CupyAdapterBatch(ArrayInterface<2> array_interface) : array_interface_(std::move(array_interface)) {} - size_t Size() const { + [[nodiscard]] std::size_t Size() const { return array_interface_.Shape(0) * array_interface_.Shape(1); } - __device__ COOTuple GetElement(size_t idx) const { + [[nodiscard]]__device__ COOTuple GetElement(size_t idx) const { size_t column_idx = idx % array_interface_.Shape(1); size_t row_idx = idx / array_interface_.Shape(1); float value = array_interface_(row_idx, column_idx); return {row_idx, column_idx, value}; } - __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const { + [[nodiscard]] __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const { float value = array_interface_(ridx, fidx); return value; } - XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); } - XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); } + [[nodiscard]] XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); } + [[nodiscard]] XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); } private: ArrayInterface<2> array_interface_; @@ -191,29 +189,28 @@ class CupyAdapter : public detail::SingleBatchDataIter { if (array_interface_.Shape(0) == 0) { return; } - device_idx_ = dh::CudaGetPointerDevice(array_interface_.data); - CHECK_NE(device_idx_, Context::kCpuId); + device_ = DeviceOrd::CUDA(dh::CudaGetPointerDevice(array_interface_.data)); + CHECK(device_.IsCUDA()); } explicit 
CupyAdapter(std::string cuda_interface_str) : CupyAdapter{StringView{cuda_interface_str}} {} - const CupyAdapterBatch& Value() const override { return batch_; } + [[nodiscard]] const CupyAdapterBatch& Value() const override { return batch_; } - size_t NumRows() const { return array_interface_.Shape(0); } - size_t NumColumns() const { return array_interface_.Shape(1); } - int32_t DeviceIdx() const { return device_idx_; } + [[nodiscard]] std::size_t NumRows() const { return array_interface_.Shape(0); } + [[nodiscard]] std::size_t NumColumns() const { return array_interface_.Shape(1); } + [[nodiscard]] DeviceOrd Device() const { return device_; } private: ArrayInterface<2> array_interface_; CupyAdapterBatch batch_; - int32_t device_idx_ {Context::kCpuId}; + DeviceOrd device_{DeviceOrd::CPU()}; }; // Returns maximum row length template -std::size_t GetRowCounts(const AdapterBatchT batch, common::Span offset, int device_idx, +std::size_t GetRowCounts(const AdapterBatchT batch, common::Span offset, DeviceOrd device, float missing) { - dh::safe_cuda(cudaSetDevice(device_idx)); - + dh::safe_cuda(cudaSetDevice(device.ordinal)); IsValidFunctor is_valid(missing); dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes())); diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index 58b96b665..c0f91380b 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -98,23 +98,18 @@ __global__ void CompressBinEllpackKernel( } // Construct an ELLPACK matrix with the given number of empty rows. -EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts, - bool is_dense, size_t row_stride, - size_t n_rows) - : is_dense(is_dense), - cuts_(std::move(cuts)), - row_stride(row_stride), - n_rows(n_rows) { +EllpackPageImpl::EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, bool is_dense, + size_t row_stride, size_t n_rows) + : is_dense(is_dense), cuts_(std::move(cuts)), row_stride(row_stride), n_rows(n_rows) { monitor_.Init("ellpack_page"); - - dh::safe_cuda(cudaSetDevice(device)); + dh::safe_cuda(cudaSetDevice(device.ordinal)); monitor_.Start("InitCompressedData"); InitCompressedData(device); monitor_.Stop("InitCompressedData"); } -EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts, +EllpackPageImpl::EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, const SparsePage &page, bool is_dense, size_t row_stride, common::Span feature_types) @@ -128,7 +123,7 @@ EllpackPageImpl::EllpackPageImpl(int device, common::HistogramCuts cuts, EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& param) : is_dense(dmat->IsDense()) { monitor_.Init("ellpack_page"); - dh::safe_cuda(cudaSetDevice(ctx->gpu_id)); + dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); n_rows = dmat->Info().num_row_; @@ -143,15 +138,15 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchP monitor_.Stop("Quantiles"); monitor_.Start("InitCompressedData"); - this->InitCompressedData(ctx->gpu_id); + this->InitCompressedData(ctx->Device()); monitor_.Stop("InitCompressedData"); - dmat->Info().feature_types.SetDevice(ctx->gpu_id); + dmat->Info().feature_types.SetDevice(ctx->Device()); auto ft = dmat->Info().feature_types.ConstDeviceSpan(); monitor_.Start("BinningCompression"); CHECK(dmat->SingleColBlock()); for (const auto& batch : dmat->GetBatches()) { - CreateHistIndices(ctx->gpu_id, batch, ft); + CreateHistIndices(ctx->Device(), batch, ft); } monitor_.Stop("BinningCompression"); } @@ -214,7 +209,7 @@ struct 
TupleScanOp { // to remove missing data template void CopyDataToEllpack(const AdapterBatchT& batch, common::Span feature_types, - EllpackPageImpl* dst, int device_idx, float missing) { + EllpackPageImpl* dst, DeviceOrd device, float missing) { // Some witchcraft happens here // The goal is to copy valid elements out of the input to an ELLPACK matrix // with a given row stride, using no extra working memory Standard stream @@ -246,7 +241,7 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span; - auto device_accessor = dst->GetDeviceAccessor(device_idx); + auto device_accessor = dst->GetDeviceAccessor(device); common::CompressedBufferWriter writer(device_accessor.NumSymbols()); auto d_compressed_buffer = dst->gidx_buffer.DevicePointer(); @@ -298,10 +293,9 @@ void CopyDataToEllpack(const AdapterBatchT& batch, common::Span row_counts) { +void WriteNullValues(EllpackPageImpl* dst, DeviceOrd device, common::Span row_counts) { // Write the null values - auto device_accessor = dst->GetDeviceAccessor(device_idx); + auto device_accessor = dst->GetDeviceAccessor(device); common::CompressedBufferWriter writer(device_accessor.NumSymbols()); auto d_compressed_buffer = dst->gidx_buffer.DevicePointer(); auto row_stride = dst->row_stride; @@ -318,11 +312,11 @@ void WriteNullValues(EllpackPageImpl* dst, int device_idx, } template -EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense, +EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense, common::Span row_counts_span, common::Span feature_types, size_t row_stride, size_t n_rows, common::HistogramCuts const& cuts) { - dh::safe_cuda(cudaSetDevice(device)); + dh::safe_cuda(cudaSetDevice(device.ordinal)); *this = EllpackPageImpl(device, cuts, is_dense, row_stride, n_rows); CopyDataToEllpack(batch, feature_types, this, device, missing); @@ -331,7 +325,7 @@ EllpackPageImpl::EllpackPageImpl(AdapterBatch batch, float missing, int device, #define ELLPACK_BATCH_SPECIALIZE(__BATCH_T) \ template EllpackPageImpl::EllpackPageImpl( \ - __BATCH_T batch, float missing, int device, bool is_dense, \ + __BATCH_T batch, float missing, DeviceOrd device, bool is_dense, \ common::Span row_counts_span, common::Span feature_types, \ size_t row_stride, size_t n_rows, common::HistogramCuts const& cuts); @@ -388,9 +382,9 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag [&](size_t i) { return page.row_ptr[i + 1] - page.row_ptr[i]; }); row_stride = *std::max_element(it, it + page.Size()); - CHECK_GE(ctx->gpu_id, 0); + CHECK(ctx->IsCUDA()); monitor_.Start("InitCompressedData"); - InitCompressedData(ctx->gpu_id); + InitCompressedData(ctx->Device()); monitor_.Stop("InitCompressedData"); // copy gidx @@ -400,7 +394,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag dh::safe_cuda(cudaMemcpyAsync(d_row_ptr.data(), page.row_ptr.data(), d_row_ptr.size_bytes(), cudaMemcpyHostToDevice, ctx->CUDACtx()->Stream())); - auto accessor = this->GetDeviceAccessor(ctx->gpu_id, ft); + auto accessor = this->GetDeviceAccessor(ctx->Device(), ft); auto null = accessor.NullValue(); CopyGHistToEllpack(page, d_row_ptr, row_stride, d_compressed_buffer, null); } @@ -425,8 +419,7 @@ struct CopyPage { }; // Copy the data from the given EllpackPage to the current page. 
-size_t EllpackPageImpl::Copy(int device, EllpackPageImpl const *page, - size_t offset) { +size_t EllpackPageImpl::Copy(DeviceOrd device, EllpackPageImpl const* page, size_t offset) { monitor_.Start("Copy"); size_t num_elements = page->n_rows * page->row_stride; CHECK_EQ(row_stride, page->row_stride); @@ -486,7 +479,7 @@ struct CompactPage { }; // Compacts the data from the given EllpackPage into the current page. -void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page, +void EllpackPageImpl::Compact(DeviceOrd device, EllpackPageImpl const* page, common::Span<size_t> row_indexes) { monitor_.Start("Compact"); CHECK_EQ(row_stride, page->row_stride); @@ -499,13 +492,12 @@ void EllpackPageImpl::Compact(int device, EllpackPageImpl const* page, } // Initialize the buffer to store compressed features. -void EllpackPageImpl::InitCompressedData(int device) { +void EllpackPageImpl::InitCompressedData(DeviceOrd device) { size_t num_symbols = NumSymbols(); // Required buffer size for storing data matrix in ELLPack format. size_t compressed_size_bytes = - common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, - num_symbols); + common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, num_symbols); gidx_buffer.SetDevice(device); // Don't call fill unnecessarily if (gidx_buffer.Size() == 0) { @@ -517,7 +509,7 @@ } // Compress a CSR page into ELLPACK. -void EllpackPageImpl::CreateHistIndices(int device, +void EllpackPageImpl::CreateHistIndices(DeviceOrd device, const SparsePage& row_batch, common::Span<FeatureType const> feature_types) { if (row_batch.Size() == 0) return; @@ -527,7 +519,7 @@ void EllpackPageImpl::CreateHistIndices(int device, // bin and compress entries in batches of rows size_t gpu_batch_nrows = - std::min(dh::TotalMemory(device) / (16 * row_stride * sizeof(Entry)), + std::min(dh::TotalMemory(device.ordinal) / (16 * row_stride * sizeof(Entry)), static_cast<size_t>(row_batch.Size())); size_t gpu_nbatches = common::DivRoundUp(row_batch.Size(), gpu_batch_nrows); @@ -592,7 +584,7 @@ size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride, } EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor( - int device, common::Span<FeatureType const> feature_types) const { + DeviceOrd device, common::Span<FeatureType const> feature_types) const { gidx_buffer.SetDevice(device); return {device, cuts_, @@ -606,7 +598,7 @@ EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor( } EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor( common::Span<FeatureType const> feature_types) const { - return {Context::kCpuId, + return {DeviceOrd::CPU(), cuts_, is_dense, row_stride, diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh index 96963463b..c64462082 100644 --- a/src/data/ellpack_page.cuh +++ b/src/data/ellpack_page.cuh @@ -35,16 +35,17 @@ struct EllpackDeviceAccessor { common::Span<FeatureType const> feature_types; - EllpackDeviceAccessor(int device, const common::HistogramCuts& cuts, - bool is_dense, size_t row_stride, size_t base_rowid, - size_t n_rows,common::CompressedIterator<uint32_t> gidx_iter, + EllpackDeviceAccessor(DeviceOrd device, const common::HistogramCuts& cuts, bool is_dense, + size_t row_stride, size_t base_rowid, size_t n_rows, + common::CompressedIterator<uint32_t> gidx_iter, common::Span<FeatureType const> feature_types) : is_dense(is_dense), row_stride(row_stride), base_rowid(base_rowid), - n_rows(n_rows) ,gidx_iter(gidx_iter), + n_rows(n_rows), + gidx_iter(gidx_iter), feature_types{feature_types} { - if (device == Context::kCpuId) { + if (device.IsCPU()) { gidx_fvalue_map =
cuts.cut_values_.ConstHostSpan(); feature_segments = cuts.cut_ptrs_.ConstHostSpan(); min_fvalue = cuts.min_vals_.ConstHostSpan(); @@ -59,7 +60,7 @@ struct EllpackDeviceAccessor { } // Get a matrix element, uses binary search for look up. Return NaN if missing. // Given a row index and a feature index, returns the corresponding cut value - __device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const { + [[nodiscard]] __device__ int32_t GetBinIndex(size_t ridx, size_t fidx) const { ridx -= base_rowid; auto row_begin = row_stride * ridx; auto row_end = row_begin + row_stride; @@ -77,7 +78,7 @@ } template <bool is_cat> - __device__ uint32_t SearchBin(float value, size_t column_id) const { + [[nodiscard]] __device__ uint32_t SearchBin(float value, size_t column_id) const { auto beg = feature_segments[column_id]; auto end = feature_segments[column_id + 1]; uint32_t idx = 0; @@ -99,7 +100,7 @@ return idx; } - __device__ bst_float GetFvalue(size_t ridx, size_t fidx) const { + [[nodiscard]] __device__ bst_float GetFvalue(size_t ridx, size_t fidx) const { auto gidx = GetBinIndex(ridx, fidx); if (gidx == -1) { return nan(""); @@ -108,18 +109,18 @@ } // Check if the row id is within range of the current batch. - __device__ bool IsInRange(size_t row_id) const { + [[nodiscard]] __device__ bool IsInRange(size_t row_id) const { return row_id >= base_rowid && row_id < base_rowid + n_rows; } /*! \brief Return the total number of symbols (total number of bins plus 1 for * not found). */ - XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; } + [[nodiscard]] XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; } - XGBOOST_DEVICE size_t NullValue() const { return gidx_fvalue_map.size(); } + [[nodiscard]] XGBOOST_DEVICE size_t NullValue() const { return gidx_fvalue_map.size(); } - XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); } + [[nodiscard]] XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); } - XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); } + [[nodiscard]] XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); } }; @@ -141,14 +142,13 @@ class EllpackPageImpl { * This is used in the sampling case. The ELLPACK page is constructed from an existing EllpackInfo * and the given number of rows. */ - EllpackPageImpl(int device, common::HistogramCuts cuts, bool is_dense, - size_t row_stride, size_t n_rows); + EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, bool is_dense, size_t row_stride, + size_t n_rows); /*! * \brief Constructor used for external memory. */ - EllpackPageImpl(int device, common::HistogramCuts cuts, - const SparsePage &page, bool is_dense, size_t row_stride, - common::Span<FeatureType const> feature_types); + EllpackPageImpl(DeviceOrd device, common::HistogramCuts cuts, const SparsePage& page, + bool is_dense, size_t row_stride, common::Span<FeatureType const> feature_types); /*! * \brief Constructor from an existing DMatrix.
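The accessor hunks above only change the device bookkeeping; the lookup logic is untouched: GetBinIndex scans a row's compressed entries, and SearchBin maps a raw feature value to a global bin with a binary search over that feature's cut points. Below is a minimal host-side sketch of that search, assuming the cut layout implied by cut_ptrs_/cut_values_ above; the function and the driver are illustrative, not part of the patch.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Global bin index of `value` for feature `fidx`. `cut_ptrs` holds one
// begin/end pair per feature indexing into the flat `cut_values` array.
std::uint32_t SearchBin(std::vector<std::uint32_t> const& cut_ptrs,
                        std::vector<float> const& cut_values,
                        std::size_t fidx, float value) {
  auto beg = cut_ptrs[fidx];      // first bin owned by this feature
  auto end = cut_ptrs[fidx + 1];  // one past the last bin
  // First cut strictly greater than `value`; cuts act as right bin boundaries.
  auto it = std::upper_bound(cut_values.cbegin() + beg, cut_values.cbegin() + end, value);
  auto idx = static_cast<std::uint32_t>(it - cut_values.cbegin());
  if (idx == end) {
    idx -= 1;  // clamp values past the last cut into the feature's last bin
  }
  return idx;
}

int main() {
  // Feature 0 owns bins [0, 3), feature 1 owns bins [3, 5).
  std::vector<std::uint32_t> cut_ptrs{0, 3, 5};
  std::vector<float> cut_values{0.5f, 1.5f, 2.5f, 10.f, 20.f};
  std::cout << SearchBin(cut_ptrs, cut_values, 0, 1.0f) << '\n';   // 1
  std::cout << SearchBin(cut_ptrs, cut_values, 1, 25.0f) << '\n';  // 4 (clamped)
}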
@@ -159,7 +159,7 @@ class EllpackPageImpl { explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm); template - explicit EllpackPageImpl(AdapterBatch batch, float missing, int device, bool is_dense, + explicit EllpackPageImpl(AdapterBatch batch, float missing, DeviceOrd device, bool is_dense, common::Span row_counts_span, common::Span feature_types, size_t row_stride, size_t n_rows, common::HistogramCuts const& cuts); @@ -176,7 +176,7 @@ class EllpackPageImpl { * @param offset The number of elements to skip before copying. * @returns The number of elements copied. */ - size_t Copy(int device, EllpackPageImpl const *page, size_t offset); + size_t Copy(DeviceOrd device, EllpackPageImpl const *page, size_t offset); /*! \brief Compact the given ELLPACK page into the current page. * @@ -184,11 +184,10 @@ class EllpackPageImpl { * @param page The ELLPACK page to compact from. * @param row_indexes Row indexes for the compacted page. */ - void Compact(int device, EllpackPageImpl const* page, common::Span row_indexes); - + void Compact(DeviceOrd device, EllpackPageImpl const* page, common::Span row_indexes); /*! \return Number of instances in the page. */ - size_t Size() const; + [[nodiscard]] size_t Size() const; /*! \brief Set the base row id for this page. */ void SetBaseRowId(std::size_t row_id) { @@ -204,12 +203,12 @@ class EllpackPageImpl { /*! \brief Return the total number of symbols (total number of bins plus 1 for * not found). */ - size_t NumSymbols() const { return cuts_.TotalBins() + 1; } + [[nodiscard]] std::size_t NumSymbols() const { return cuts_.TotalBins() + 1; } - EllpackDeviceAccessor - GetDeviceAccessor(int device, - common::Span feature_types = {}) const; - EllpackDeviceAccessor GetHostAccessor(common::Span feature_types = {}) const; + [[nodiscard]] EllpackDeviceAccessor GetDeviceAccessor( + DeviceOrd device, common::Span feature_types = {}) const; + [[nodiscard]] EllpackDeviceAccessor GetHostAccessor( + common::Span feature_types = {}) const; private: /*! @@ -218,13 +217,13 @@ class EllpackPageImpl { * @param device The GPU device to use. * @param row_batch The CSR page. */ - void CreateHistIndices(int device, + void CreateHistIndices(DeviceOrd device, const SparsePage& row_batch, common::Span feature_types); /*! * \brief Initialize the buffer to store compressed features. 
*/ - void InitCompressedData(int device); + void InitCompressedData(DeviceOrd device); public: diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index abfc400c1..41b0f480b 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -10,7 +10,7 @@ namespace xgboost::data { void EllpackPageSource::Fetch() { - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); if (!this->ReadCache()) { if (count_ != 0 && !sync_) { // source is initialized to be the 0th page during construction, so when count_ is 0 diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h index bf0d44860..98a90111b 100644 --- a/src/data/ellpack_page_source.h +++ b/src/data/ellpack_page_source.h @@ -23,14 +23,14 @@ class EllpackPageSource : public PageSourceIncMixIn<EllpackPage> { BatchParam param_; common::Span<FeatureType const> feature_types_; std::unique_ptr<common::HistogramCuts> cuts_; - std::int32_t device_; + DeviceOrd device_; public: EllpackPageSource(float missing, int nthreads, bst_feature_t n_features, size_t n_batches, std::shared_ptr<Cache> cache, BatchParam param, std::unique_ptr<common::HistogramCuts> cuts, bool is_dense, size_t row_stride, common::Span<FeatureType const> feature_types, - std::shared_ptr<SparsePageSource> source, std::int32_t device) + std::shared_ptr<SparsePageSource> source, DeviceOrd device) : PageSourceIncMixIn(missing, nthreads, n_features, n_batches, cache, false), is_dense_{is_dense}, row_stride_{row_stride}, @@ -50,6 +50,7 @@ inline void EllpackPageSource::Fetch() { // silence the warning about unused variables. (void)(row_stride_); (void)(is_dense_); + (void)(device_); common::AssertGPUSupport(); } #endif // !defined(XGBOOST_USE_CUDA) diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index 0909add9f..11c3e0642 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -36,8 +36,7 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro auto pctx = MakeProxy(proxy_)->Ctx(); Context ctx; - ctx.UpdateAllowUnknown( - Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}}); + ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", pctx->DeviceName()}}); // hardcoded parameter. BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()}; @@ -139,7 +138,7 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p, return HostAdapterDispatch(proxy, [&](auto const& value) { size_t n_threads = ctx->Threads(); size_t n_features = column_sizes.size(); - linalg::Tensor<std::size_t, 2> column_sizes_tloc({n_threads, n_features}, Context::kCpuId); + linalg::Tensor<std::size_t, 2> column_sizes_tloc({n_threads, n_features}, DeviceOrd::CPU()); column_sizes_tloc.Data()->Fill(0ul); auto view = column_sizes_tloc.HostView(); common::ParallelFor(value.Size(), n_threads, common::Sched::Static(256), [&](auto i) { diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index 4825b58e7..68a58fd60 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -48,10 +48,9 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, int32_t current_device; dh::safe_cuda(cudaGetDevice(&current_device)); - - auto get_device = [&]() -> int32_t { - std::int32_t d = (ctx->gpu_id == Context::kCpuId) ? current_device : ctx->gpu_id; - CHECK_NE(d, Context::kCpuId); + auto get_device = [&]() { + auto d = (ctx->IsCPU()) ?
DeviceOrd::CUDA(current_device) : ctx->Device(); + CHECK(!d.IsCPU()); return d; }; @@ -61,11 +60,8 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, common::HistogramCuts cuts; do { // We use do while here as the first batch is fetched in ctor - // ctx_.gpu_id = proxy->DeviceIdx(); - CHECK_LT(ctx->gpu_id, common::AllVisibleGPUs()); - - dh::safe_cuda(cudaSetDevice(get_device())); - + CHECK_LT(ctx->Ordinal(), common::AllVisibleGPUs()); + dh::safe_cuda(cudaSetDevice(get_device().ordinal)); if (cols == 0) { cols = num_cols(); collective::Allreduce<collective::Operation::kMax>(&cols, 1); @@ -103,8 +99,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, auto n_features = cols; CHECK_GE(n_features, 1) << "Data must have at least 1 column."; - dh::safe_cuda(cudaSetDevice(get_device())); - + dh::safe_cuda(cudaSetDevice(get_device().ordinal)); if (!ref) { HostDeviceVector<FeatureType> ft; common::SketchContainer final_sketch( @@ -143,9 +138,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, size_t n_batches_for_verification = 0; while (iter.Next()) { init_page(); - - dh::safe_cuda(cudaSetDevice(get_device())); - + dh::safe_cuda(cudaSetDevice(get_device().ordinal)); auto rows = num_rows(); dh::device_vector<size_t> row_counts(rows + 1, 0); common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size()); @@ -197,18 +190,18 @@ BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(Context const* ctx, if (!ellpack_) { ellpack_.reset(new EllpackPage()); if (ctx->IsCUDA()) { - this->Info().feature_types.SetDevice(ctx->gpu_id); + this->Info().feature_types.SetDevice(ctx->Device()); *ellpack_->Impl() = EllpackPageImpl(ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()); } else if (fmat_ctx_.IsCUDA()) { - this->Info().feature_types.SetDevice(fmat_ctx_.gpu_id); + this->Info().feature_types.SetDevice(fmat_ctx_.Device()); *ellpack_->Impl() = EllpackPageImpl(&fmat_ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()); } else { // Can happen when QDM is initialized on CPU, but a GPU version is queried by a different QDM // for cut reference.
auto cuda_ctx = ctx->MakeCUDA(); - this->Info().feature_types.SetDevice(cuda_ctx.gpu_id); + this->Info().feature_types.SetDevice(cuda_ctx.Device()); *ellpack_->Impl() = EllpackPageImpl(&cuda_ctx, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()); } diff --git a/src/data/proxy_dmatrix.cc b/src/data/proxy_dmatrix.cc index df26c437a..a63d003c0 100644 --- a/src/data/proxy_dmatrix.cc +++ b/src/data/proxy_dmatrix.cc @@ -11,18 +11,18 @@ void DMatrixProxy::SetArrayData(StringView interface_str) { this->batch_ = adapter; this->Info().num_col_ = adapter->NumColumns(); this->Info().num_row_ = adapter->NumRows(); - this->ctx_.gpu_id = Context::kCpuId; + this->ctx_.Init(Args{{"device", "cpu"}}); } -void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices, - char const *c_values, bst_feature_t n_features, bool on_host) { +void DMatrixProxy::SetCSRData(char const *c_indptr, char const *c_indices, char const *c_values, + bst_feature_t n_features, bool on_host) { CHECK(on_host) << "Not implemented on device."; std::shared_ptr adapter{new CSRArrayAdapter( StringView{c_indptr}, StringView{c_indices}, StringView{c_values}, n_features)}; this->batch_ = adapter; this->Info().num_col_ = adapter->NumColumns(); this->Info().num_row_ = adapter->NumRows(); - this->ctx_.gpu_id = Context::kCpuId; + this->ctx_.Init(Args{{"device", "cpu"}}); } namespace cuda_impl { diff --git a/src/data/proxy_dmatrix.cu b/src/data/proxy_dmatrix.cu index ded1c3aef..cd76e49cf 100644 --- a/src/data/proxy_dmatrix.cu +++ b/src/data/proxy_dmatrix.cu @@ -11,13 +11,13 @@ void DMatrixProxy::FromCudaColumnar(StringView interface_str) { this->batch_ = adapter; this->Info().num_col_ = adapter->NumColumns(); this->Info().num_row_ = adapter->NumRows(); - if (adapter->DeviceIdx() < 0) { + if (adapter->Device().IsCPU()) { // empty data CHECK_EQ(this->Info().num_row_, 0); ctx_ = ctx_.MakeCUDA(dh::CurrentDevice()); return; } - ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx()); + ctx_ = ctx_.MakeCUDA(adapter->Device().ordinal); } void DMatrixProxy::FromCudaArray(StringView interface_str) { @@ -25,13 +25,13 @@ void DMatrixProxy::FromCudaArray(StringView interface_str) { this->batch_ = adapter; this->Info().num_col_ = adapter->NumColumns(); this->Info().num_row_ = adapter->NumRows(); - if (adapter->DeviceIdx() < 0) { + if (adapter->Device().IsCPU()) { // empty data CHECK_EQ(this->Info().num_row_, 0); ctx_ = ctx_.MakeCUDA(dh::CurrentDevice()); return; } - ctx_ = ctx_.MakeCUDA(adapter->DeviceIdx()); + ctx_ = ctx_.MakeCUDA(adapter->Device().ordinal); } namespace cuda_impl { diff --git a/src/data/proxy_dmatrix.h b/src/data/proxy_dmatrix.h index cc8551819..7f17ecf00 100644 --- a/src/data/proxy_dmatrix.h +++ b/src/data/proxy_dmatrix.h @@ -46,7 +46,7 @@ class DMatrixProxy : public DMatrix { #endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP) public: - int DeviceIdx() const { return ctx_.gpu_id; } + DeviceOrd Device() const { return ctx_.Device(); } void SetCUDAArray(char const* c_interface) { common::AssertGPUSupport(); diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc index 85ede3258..3814d74d2 100644 --- a/src/data/simple_dmatrix.cc +++ b/src/data/simple_dmatrix.cc @@ -75,11 +75,9 @@ DMatrix* SimpleDMatrix::SliceCol(int num_slices, int slice_id) { } void SimpleDMatrix::ReindexFeatures(Context const* ctx) { - if (info_.IsVerticalFederated()) { - std::vector buffer(collective::GetWorldSize()); - buffer[collective::GetRank()] = info_.num_col_; - collective::Allgather(buffer.data(), buffer.size() * 
sizeof(uint64_t)); - auto offset = std::accumulate(buffer.cbegin(), buffer.cbegin() + collective::GetRank(), 0); + if (info_.IsColumnSplit() && collective::GetWorldSize() > 1) { + auto const cols = collective::Allgather(info_.num_col_); + auto const offset = std::accumulate(cols.cbegin(), cols.cbegin() + collective::GetRank(), 0ul); if (offset == 0) { return; } @@ -253,7 +251,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread, } if (batch.BaseMargin() != nullptr) { info_.base_margin_ = decltype(info_.base_margin_){ - batch.BaseMargin(), batch.BaseMargin() + batch.Size(), {batch.Size()}, Context::kCpuId}; + batch.BaseMargin(), batch.BaseMargin() + batch.Size(), {batch.Size()}, DeviceOrd::CPU()}; } if (batch.Qid() != nullptr) { qids.insert(qids.end(), batch.Qid(), batch.Qid() + batch.Size()); @@ -361,78 +359,4 @@ template SimpleDMatrix::SimpleDMatrix(FileAdapter* adapter, float missing, int n template SimpleDMatrix::SimpleDMatrix( IteratorAdapter* adapter, float missing, int nthread, DataSplitMode data_split_mode); - -template <> -SimpleDMatrix::SimpleDMatrix(RecordBatchesIterAdapter* adapter, float missing, int nthread, - DataSplitMode data_split_mode) { - Context ctx; - ctx.nthread = nthread; - - auto& offset_vec = sparse_page_->offset.HostVector(); - auto& data_vec = sparse_page_->data.HostVector(); - uint64_t total_batch_size = 0; - uint64_t total_elements = 0; - - adapter->BeforeFirst(); - // Iterate over batches of input data - while (adapter->Next()) { - auto& batches = adapter->Value(); - size_t num_elements = 0; - size_t num_rows = 0; - // Import Arrow RecordBatches -#pragma omp parallel for reduction(+ : num_elements, num_rows) num_threads(ctx.Threads()) - for (int i = 0; i < static_cast(batches.size()); ++i) { // NOLINT - num_elements += batches[i]->Import(missing); - num_rows += batches[i]->Size(); - } - total_elements += num_elements; - total_batch_size += num_rows; - // Compute global offset for every row and starting row for every batch - std::vector batch_offsets(batches.size()); - for (size_t i = 0; i < batches.size(); ++i) { - if (i == 0) { - batch_offsets[i] = total_batch_size - num_rows; - batches[i]->ShiftRowOffsets(total_elements - num_elements); - } else { - batch_offsets[i] = batch_offsets[i - 1] + batches[i - 1]->Size(); - batches[i]->ShiftRowOffsets(batches[i - 1]->RowOffsets().back()); - } - } - // Pre-allocate DMatrix memory - data_vec.resize(total_elements); - offset_vec.resize(total_batch_size + 1); - // Copy data into DMatrix -#pragma omp parallel num_threads(ctx.Threads()) - { -#pragma omp for nowait - for (int i = 0; i < static_cast(batches.size()); ++i) { // NOLINT - size_t begin = batches[i]->RowOffsets()[0]; - for (size_t k = 0; k < batches[i]->Size(); ++k) { - for (size_t j = 0; j < batches[i]->NumColumns(); ++j) { - auto element = batches[i]->GetColumn(j).GetElement(k); - if (!std::isnan(element.value)) { - data_vec[begin++] = Entry(element.column_idx, element.value); - } - } - } - } -#pragma omp for nowait - for (int i = 0; i < static_cast(batches.size()); ++i) { - auto& offsets = batches[i]->RowOffsets(); - std::copy(offsets.begin() + 1, offsets.end(), offset_vec.begin() + batch_offsets[i] + 1); - } - } - } - // Synchronise worker columns - info_.num_col_ = adapter->NumColumns(); - info_.data_split_mode = data_split_mode; - ReindexFeatures(&ctx); - info_.SynchronizeNumberOfColumns(); - - info_.num_row_ = total_batch_size; - info_.num_nonzero_ = data_vec.size(); - CHECK_EQ(offset_vec.back(), info_.num_nonzero_); - - 
fmat_ctx_ = ctx; -} } // namespace xgboost::data diff --git a/src/data/simple_dmatrix.cu b/src/data/simple_dmatrix.cu index 39d701b43..e41d59394 100644 --- a/src/data/simple_dmatrix.cu +++ b/src/data/simple_dmatrix.cu @@ -10,9 +10,7 @@ #include "xgboost/context.h" // for Context #include "xgboost/data.h" -namespace xgboost { -namespace data { - +namespace xgboost::data { // Does not currently support metainfo as no on-device data source contains this // Current implementation assumes a single batch. More batches can // be supported in future. Does not currently support inferring row/column size @@ -21,14 +19,14 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr DataSplitMode data_split_mode) { CHECK(data_split_mode != DataSplitMode::kCol) << "Column-wise data split is currently not supported on the GPU."; - auto device = (adapter->DeviceIdx() < 0 || adapter->NumRows() == 0) ? dh::CurrentDevice() - : adapter->DeviceIdx(); - CHECK_GE(device, 0); - - dh::safe_cuda(cudaSetDevice(device)); + auto device = (adapter->Device().IsCPU() || adapter->NumRows() == 0) + ? DeviceOrd::CUDA(dh::CurrentDevice()) + : adapter->Device(); + CHECK(device.IsCUDA()); + dh::safe_cuda(cudaSetDevice(device.ordinal)); Context ctx; - ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", DeviceOrd::CUDA(device).Name()}}); + ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", device.Name()}}); CHECK(adapter->NumRows() != kAdapterUnknownSize); CHECK(adapter->NumColumns() != kAdapterUnknownSize); @@ -53,5 +51,4 @@ template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing, int nthread, DataSplitMode data_split_mode); template SimpleDMatrix::SimpleDMatrix(CupyAdapter* adapter, float missing, int nthread, DataSplitMode data_split_mode); -} // namespace data -} // namespace xgboost +} // namespace xgboost::data diff --git a/src/data/simple_dmatrix.cuh b/src/data/simple_dmatrix.cuh index a26899ff1..01e532d01 100644 --- a/src/data/simple_dmatrix.cuh +++ b/src/data/simple_dmatrix.cuh @@ -54,11 +54,9 @@ void CopyDataToDMatrix(AdapterBatchT batch, common::Span data, } template -void CountRowOffsets(const AdapterBatchT& batch, common::Span offset, - int device_idx, float missing) { - - dh::safe_cuda(cudaSetDevice(device_idx)); - +void CountRowOffsets(const AdapterBatchT& batch, common::Span offset, DeviceOrd device, + float missing) { + dh::safe_cuda(cudaSetDevice(device.ordinal)); IsValidFunctor is_valid(missing); // Count elements per row dh::LaunchN(batch.Size(), [=] __device__(size_t idx) { @@ -71,22 +69,19 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span offset, }); dh::XGBCachingDeviceAllocator alloc; - -#if defined(XGBOOST_USE_HIP) - thrust::exclusive_scan(thrust::hip::par(alloc), - thrust::device_pointer_cast(offset.data()), - thrust::device_pointer_cast(offset.data() + offset.size()), - thrust::device_pointer_cast(offset.data())); -#elif defined(XGBOOST_USE_CUDA) - thrust::exclusive_scan(thrust::cuda::par(alloc), - thrust::device_pointer_cast(offset.data()), - thrust::device_pointer_cast(offset.data() + offset.size()), - thrust::device_pointer_cast(offset.data())); +#if defined(XGBOOST_USE_CUDA) + thrust::exclusive_scan(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()), + thrust::device_pointer_cast(offset.data() + offset.size()), + thrust::device_pointer_cast(offset.data())); +#elif defined(XGBOOST_USE_HIP) + thrust::exclusive_scan(thrust::hip::par(alloc), thrust::device_pointer_cast(offset.data()), + 
thrust::device_pointer_cast(offset.data() + offset.size()), + thrust::device_pointer_cast(offset.data())); #endif } template -size_t CopyToSparsePage(AdapterBatchT const& batch, int32_t device, float missing, +size_t CopyToSparsePage(AdapterBatchT const& batch, DeviceOrd device, float missing, SparsePage* page) { bool valid = NoInfInData(batch, IsValidFunctor{missing}); CHECK(valid) << error::InfInData(); diff --git a/src/data/simple_dmatrix.h b/src/data/simple_dmatrix.h index 56685c1e6..5b5bb2bfb 100644 --- a/src/data/simple_dmatrix.h +++ b/src/data/simple_dmatrix.h @@ -1,5 +1,5 @@ -/*! - * Copyright 2015-2022 by XGBoost Contributors +/** + * Copyright 2015-2023, XGBoost Contributors * \file simple_dmatrix.h * \brief In-memory version of DMatrix. * \author Tianqi Chen @@ -15,8 +15,7 @@ #include "gradient_index.h" -namespace xgboost { -namespace data { +namespace xgboost::data { // Used for single batch data. class SimpleDMatrix : public DMatrix { public: @@ -65,9 +64,10 @@ class SimpleDMatrix : public DMatrix { /** * \brief Reindex the features based on a global view. * - * In some cases (e.g. vertical federated learning), features are loaded locally with indices - * starting from 0. However, all the algorithms assume the features are globally indexed, so we - * reindex the features based on the offset needed to obtain the global view. + * In some cases (e.g. column-wise data split and vertical federated learning), features are + * loaded locally with indices starting from 0. However, all the algorithms assume the features + * are globally indexed, so we reindex the features based on the offset needed to obtain the + * global view. */ void ReindexFeatures(Context const* ctx); @@ -75,6 +75,5 @@ class SimpleDMatrix : public DMatrix { // Context used only for DMatrix initialization. Context fmat_ctx_; }; -} // namespace data -} // namespace xgboost +} // namespace xgboost::data #endif // XGBOOST_DATA_SIMPLE_DMATRIX_H_ diff --git a/src/data/sparse_page_dmatrix.cu b/src/data/sparse_page_dmatrix.cu index 9d4c63387..572d6cb08 100644 --- a/src/data/sparse_page_dmatrix.cu +++ b/src/data/sparse_page_dmatrix.cu @@ -45,7 +45,8 @@ BatchSet SparsePageDMatrix::GetEllpackBatches(Context const* ctx, ellpack_page_source_.reset(); // make sure resource is released before making new ones. 
ellpack_page_source_ = std::make_shared<EllpackPageSource>( this->missing_, ctx->Threads(), this->Info().num_col_, this->n_batches_, cache_info_.at(id), - param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_, ctx->gpu_id); + param, std::move(cuts), this->IsDense(), row_stride, ft, sparse_page_source_, + ctx->Device()); } else { CHECK(sparse_page_source_); ellpack_page_source_->Reset(); diff --git a/src/data/sparse_page_source.cu b/src/data/sparse_page_source.cu index 8d4adda17..40037eedc 100644 --- a/src/data/sparse_page_source.cu +++ b/src/data/sparse_page_source.cu @@ -19,11 +19,11 @@ std::size_t NFeaturesDevice(DMatrixProxy *proxy) { } // namespace detail void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) { - auto device = proxy->DeviceIdx(); - if (device < 0) { - device = dh::CurrentDevice(); + auto device = proxy->Device(); + if (device.IsCPU()) { + device = DeviceOrd::CUDA(dh::CurrentDevice()); } - CHECK_GE(device, 0); + CHECK(device.IsCUDA()); cuda_impl::Dispatch(proxy, [&](auto const &value) { CopyToSparsePage(value, device, missing, page); }); diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h index d3708126c..796409c85 100644 --- a/src/data/sparse_page_source.h +++ b/src/data/sparse_page_source.h @@ -177,15 +177,15 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> { } // A heuristic for the number of pre-fetched batches. We can make it part of BatchParam // to let user adjust number of pre-fetched batches when needed. - uint32_t constexpr kPreFetch = 3; - - size_t n_prefetch_batches = std::min(kPreFetch, n_batches_); + std::int32_t n_prefetches = std::max(nthreads_, 3); + std::int32_t n_prefetch_batches = + std::min(static_cast<std::size_t>(n_prefetches), n_batches_); CHECK_GT(n_prefetch_batches, 0) << "total batches:" << n_batches_; std::size_t fetch_it = count_; exce_.Rethrow(); - for (std::size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) { + for (std::int32_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) { fetch_it %= n_batches_; // ring if (ring_->at(fetch_it).valid()) { continue; diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc index 9bd9ce098..89f14310c 100644 --- a/src/gbm/gbtree.cc +++ b/src/gbm/gbtree.cc @@ -212,7 +212,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair, bst_target_t const n_groups = model_.learner_model_param->OutputLength(); monitor_.Start("BoostNewTrees"); - predt->predictions.SetDevice(ctx_->Ordinal()); + predt->predictions.SetDevice(ctx_->Device()); auto out = linalg::MakeTensorView(ctx_, &predt->predictions, p_fmat->Info().num_row_, model_.learner_model_param->OutputLength()); CHECK_NE(n_groups, 0); @@ -248,7 +248,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair, } else { CHECK_EQ(in_gpair->Size() % n_groups, 0U) << "must have exactly ngroup * nrow gpairs"; linalg::Matrix<GradientPair> tmp{{in_gpair->Shape(0), static_cast<std::size_t>(1ul)}, - ctx_->Ordinal()}; + ctx_->Device()}; bool update_predict = true; for (bst_target_t gid = 0; gid < n_groups; ++gid) { node_position.clear(); @@ -736,7 +736,7 @@ class Dart : public GBTree { PredictionCacheEntry predts; // temporary storage for prediction if (ctx_->IsCUDA()) { - predts.predictions.SetDevice(ctx_->gpu_id); + predts.predictions.SetDevice(ctx_->Device()); } predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0); // multi-target is not yet supported.
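The sparse_page_source.h hunk above replaces the fixed prefetch depth (kPreFetch = 3) with one that scales with the reader's thread count, keeping 3 as a floor and the batch count as a cap. A self-contained sketch of that sizing rule follows; the cast restored in the hunk assumes n_batches_ is a std::size_t, and the names here are illustrative rather than taken from the patch.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>

std::int32_t PrefetchBatches(std::int32_t n_threads, std::size_t n_batches) {
  // Keep at least 3 pages in flight (the old constant), more if threads allow.
  std::int32_t n_prefetches = std::max(n_threads, 3);
  // Never prefetch more pages than the cache actually holds.
  return static_cast<std::int32_t>(
      std::min(static_cast<std::size_t>(n_prefetches), n_batches));
}

int main() {
  std::cout << PrefetchBatches(8, 100) << '\n';  // 8: thread-bound
  std::cout << PrefetchBatches(2, 100) << '\n';  // 3: floor from the old constant
  std::cout << PrefetchBatches(8, 2) << '\n';    // 2: capped by the batch count
}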
@@ -761,8 +761,8 @@ class Dart : public GBTree { CHECK_EQ(p_out_preds->predictions.Size(), predts.predictions.Size()); size_t n_rows = p_fmat->Info().num_row_; - if (predts.predictions.DeviceIdx() != Context::kCpuId) { - p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx()); + if (predts.predictions.Device().IsCUDA()) { + p_out_preds->predictions.SetDevice(predts.predictions.Device()); GPUDartPredictInc(p_out_preds->predictions.DeviceSpan(), predts.predictions.DeviceSpan(), w, n_rows, n_groups, group); @@ -801,8 +801,8 @@ class Dart : public GBTree { StringView msg{"Unsupported data type for inplace predict."}; PredictionCacheEntry predts; - if (ctx_->gpu_id != Context::kCpuId) { - predts.predictions.SetDevice(ctx_->gpu_id); + if (ctx_->IsCUDA()) { + predts.predictions.SetDevice(ctx_->Device()); } predts.predictions.Resize(p_fmat->Info().num_row_ * n_groups, 0); @@ -838,8 +838,8 @@ class Dart : public GBTree { CHECK_EQ(predts.predictions.Size(), p_out_preds->predictions.Size()); size_t n_rows = p_fmat->Info().num_row_; - if (predts.predictions.DeviceIdx() != Context::kCpuId) { - p_out_preds->predictions.SetDevice(predts.predictions.DeviceIdx()); + if (predts.predictions.Device().IsCUDA()) { + p_out_preds->predictions.SetDevice(predts.predictions.Device()); auto base_score = model_.learner_model_param->BaseScore(predts.predictions.Device()); GPUDartInplacePredictInc(p_out_preds->predictions.DeviceSpan(), predts.predictions.DeviceSpan(), w, n_rows, base_score, n_groups, diff --git a/src/gbm/gbtree_model.cc b/src/gbm/gbtree_model.cc index 1373e3e2b..14131865f 100644 --- a/src/gbm/gbtree_model.cc +++ b/src/gbm/gbtree_model.cc @@ -106,14 +106,30 @@ void GBTreeModel::Load(dmlc::Stream* fi) { Validate(*this); } +namespace { +std::int32_t IOThreads(Context const* ctx) { + CHECK(ctx); + std::int32_t n_threads = ctx->Threads(); + // CRAN checks for number of threads used by examples, but we might not have the right + // number of threads when serializing/unserializing models as nthread is a booster + // parameter, which is only effective after booster initialization. + // + // The threshold ratio of CPU time to user time for R is 2.5, we set the number of + // threads to 2. +#if defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1 + n_threads = std::min(2, n_threads); +#endif + return n_threads; +} +} // namespace + void GBTreeModel::SaveModel(Json* p_out) const { auto& out = *p_out; CHECK_EQ(param.num_trees, static_cast(trees.size())); out["gbtree_model_param"] = ToJson(param); std::vector trees_json(trees.size()); - CHECK(ctx_); - common::ParallelFor(trees.size(), ctx_->Threads(), [&](auto t) { + common::ParallelFor(trees.size(), IOThreads(ctx_), [&](auto t) { auto const& tree = trees[t]; Json jtree{Object{}}; tree->SaveModel(&jtree); @@ -151,9 +167,7 @@ void GBTreeModel::LoadModel(Json const& in) { CHECK_EQ(tree_info_json.size(), param.num_trees); tree_info.resize(param.num_trees); - CHECK(ctx_); - - common::ParallelFor(param.num_trees, ctx_->Threads(), [&](auto t) { + common::ParallelFor(param.num_trees, IOThreads(ctx_), [&](auto t) { auto tree_id = get(trees_json[t]["id"]); trees.at(tree_id).reset(new RegTree{}); trees[tree_id]->LoadModel(trees_json[t]); diff --git a/src/learner.cc b/src/learner.cc index a7e319d6d..5d7c85dd6 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -278,7 +278,7 @@ LearnerModelParam::LearnerModelParam(Context const* ctx, LearnerModelParamLegacy std::swap(base_score_, base_margin); // Make sure read access everywhere for thread-safe prediction. 
std::as_const(base_score_).HostView(); - if (!ctx->IsCPU()) { + if (ctx->IsCUDA()) { std::as_const(base_score_).View(ctx->Device()); } CHECK(std::as_const(base_score_).Data()->HostCanRead()); @@ -287,7 +287,7 @@ LearnerModelParam::LearnerModelParam(Context const* ctx, LearnerModelParamLegacy linalg::TensorView LearnerModelParam::BaseScore(DeviceOrd device) const { // multi-class is not yet supported. CHECK_EQ(base_score_.Size(), 1) << ModelNotFitted(); - if (device.IsCPU()) { + if (!device.IsCUDA()) { // Make sure that we won't run into race condition. CHECK(base_score_.Data()->HostCanRead()); return base_score_.HostView(); @@ -305,10 +305,10 @@ linalg::TensorView LearnerModelParam::BaseScore(Context const* c void LearnerModelParam::Copy(LearnerModelParam const& that) { base_score_.Reshape(that.base_score_.Shape()); - base_score_.Data()->SetDevice(that.base_score_.DeviceIdx()); + base_score_.Data()->SetDevice(that.base_score_.Device()); base_score_.Data()->Copy(*that.base_score_.Data()); std::as_const(base_score_).HostView(); - if (that.base_score_.DeviceIdx() != Context::kCpuId) { + if (!that.base_score_.Device().IsCPU()) { std::as_const(base_score_).View(that.base_score_.Device()); } CHECK_EQ(base_score_.Data()->DeviceCanRead(), that.base_score_.Data()->DeviceCanRead()); @@ -424,7 +424,7 @@ class LearnerConfiguration : public Learner { if (mparam_.boost_from_average && !UsePtr(gbm_)->ModelFitted()) { if (p_fmat) { auto const& info = p_fmat->Info(); - info.Validate(Ctx()->Ordinal()); + info.Validate(Ctx()->Device()); // We estimate it from input data. linalg::Tensor base_score; InitEstimation(info, &base_score); @@ -446,7 +446,7 @@ class LearnerConfiguration : public Learner { monitor_.Init("Learner"); for (std::shared_ptr const& d : cache) { if (d) { - prediction_container_.Cache(d, Context::kCpuId); + prediction_container_.Cache(d, DeviceOrd::CPU()); } } } @@ -1057,7 +1057,7 @@ class LearnerIO : public LearnerConfiguration { ? 
std::numeric_limits<float>::quiet_NaN() : obj_->ProbToMargin(mparam_.base_score)}, {1}, - Context::kCpuId}, + DeviceOrd::CPU()}, obj_->Task(), tparam_.multi_strategy); if (attributes_.find("objective") != attributes_.cend()) { @@ -1282,7 +1282,7 @@ class LearnerImpl : public LearnerIO { this->ValidateDMatrix(train.get(), true); - auto& predt = prediction_container_.Cache(train, ctx_.gpu_id); + auto& predt = prediction_container_.Cache(train, ctx_.Device()); monitor_.Start("PredictRaw"); this->PredictRaw(train.get(), &predt, true, 0, 0); @@ -1312,7 +1312,7 @@ class LearnerImpl : public LearnerIO { CHECK_EQ(this->learner_model_param_.OutputLength(), in_gpair->Shape(1)) << "The number of columns in gradient should be equal to the number of targets/classes in " "the model."; - auto& predt = prediction_container_.Cache(train, ctx_.gpu_id); + auto& predt = prediction_container_.Cache(train, ctx_.Device()); gbm_->DoBoost(train.get(), in_gpair, &predt, obj_.get()); monitor_.Stop("BoostOneIter"); } @@ -1330,17 +1330,19 @@ class LearnerImpl : public LearnerIO { if (metrics_.empty() && tparam_.disable_default_eval_metric <= 0) { metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric(), &ctx_)); auto config = obj_->DefaultMetricConfig(); - metrics_.back()->LoadConfig(config); + if (!IsA<Null>(config)) { + metrics_.back()->LoadConfig(config); + } metrics_.back()->Configure({cfg_.begin(), cfg_.end()}); } for (size_t i = 0; i < data_sets.size(); ++i) { std::shared_ptr<DMatrix> m = data_sets[i]; - auto &predt = prediction_container_.Cache(m, ctx_.gpu_id); + auto &predt = prediction_container_.Cache(m, ctx_.Device()); this->ValidateDMatrix(m.get(), false); this->PredictRaw(m.get(), &predt, false, 0, 0); - auto &out = output_predictions_.Cache(m, ctx_.gpu_id).predictions; + auto &out = output_predictions_.Cache(m, ctx_.Device()).predictions; out.Resize(predt.predictions.Size()); out.Copy(predt.predictions); @@ -1376,7 +1378,7 @@ class LearnerImpl : public LearnerIO { } else if (pred_leaf) { gbm_->PredictLeaf(data.get(), out_preds, layer_begin, layer_end); } else { - auto& prediction = prediction_container_.Cache(data, ctx_.gpu_id); + auto& prediction = prediction_container_.Cache(data, ctx_.Device()); this->PredictRaw(data.get(), &prediction, training, layer_begin, layer_end); // Copy the prediction cache to output prediction.
out_preds comes from C API out_preds->SetDevice(ctx_.Device()); @@ -1456,7 +1458,7 @@ class LearnerImpl : public LearnerIO { void ValidateDMatrix(DMatrix* p_fmat, bool is_training) const { MetaInfo const& info = p_fmat->Info(); - info.Validate(ctx_.gpu_id); + info.Validate(ctx_.Device()); if (is_training) { CHECK_EQ(learner_model_param_.num_feature, p_fmat->Info().num_col_) diff --git a/src/linear/updater_gpu_coordinate.cu b/src/linear/updater_gpu_coordinate.cu index 1c1ae1ba4..3f7ab7895 100644 --- a/src/linear/updater_gpu_coordinate.cu +++ b/src/linear/updater_gpu_coordinate.cu @@ -48,7 +48,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT } void LazyInitDevice(DMatrix *p_fmat, const LearnerModelParam &model_param) { - if (ctx_->gpu_id < 0) return; + if (ctx_->IsCPU()) return; num_row_ = static_cast<size_t>(p_fmat->Info().num_row_); @@ -60,8 +60,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT return; } - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); - + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); // The begin and end indices for the section of each column associated with // this device std::vector<std::pair<bst_uint, bst_uint>> column_segments; @@ -135,7 +134,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT ++group_idx) { // Get gradient auto grad = GradientPair(0, 0); - if (ctx_->gpu_id >= 0) { + if (ctx_->IsCUDA()) { grad = GetBiasGradient(group_idx, model->learner_model_param->num_output_group); } auto dbias = static_cast<bst_float>( @@ -144,7 +143,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT model->Bias()[group_idx] += dbias; // Update residual - if (ctx_->gpu_id >= 0) { + if (ctx_->IsCUDA()) { UpdateBiasResidual(dbias, group_idx, model->learner_model_param->num_output_group); } } @@ -155,7 +154,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT bst_float &w = (*model)[fidx][group_idx]; // Get gradient auto grad = GradientPair(0, 0); - if (ctx_->gpu_id >= 0) { + if (ctx_->IsCUDA()) { grad = GetGradient(group_idx, model->learner_model_param->num_output_group, fidx); } auto dw = static_cast<bst_float>(tparam_.learning_rate * @@ -164,15 +163,14 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT tparam_.reg_lambda_denorm)); w += dw; - if (ctx_->gpu_id >= 0) { + if (ctx_->IsCUDA()) { UpdateResidual(dw, group_idx, model->learner_model_param->num_output_group, fidx); } } // This needs to be public because of the __device__ lambda. GradientPair GetBiasGradient(int group_idx, int num_group) { - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); - + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); auto counting = thrust::make_counting_iterator(0ull); auto f = [=] __device__(size_t idx) { return idx * num_group + group_idx; @@ -196,8 +194,7 @@ class GPUCoordinateUpdater : public LinearUpdater { // NOLINT // This needs to be public because of the __device__ lambda. GradientPair GetGradient(int group_idx, int num_group, int fidx) { - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); - + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); common::Span<Entry> d_col = dh::ToSpan(data_).subspan(row_ptr_[fidx]); size_t col_size = row_ptr_[fidx + 1] - row_ptr_[fidx]; common::Span<GradientPair> d_gpair = dh::ToSpan(gpair_); diff --git a/src/metric/auc.cc b/src/metric/auc.cc index 4cd7cf874..eb6025d6c 100644 --- a/src/metric/auc.cc +++ b/src/metric/auc.cc @@ -23,8 +23,7 @@ #include "xgboost/linalg.h" #include "xgboost/metric.h" -namespace xgboost { -namespace metric { +namespace xgboost::metric { // tag this file, used by force static link later.
DMLC_REGISTRY_FILE_TAG(auc); /** @@ -257,10 +256,10 @@ template class EvalAUC : public MetricNoCache { double Eval(const HostDeviceVector &preds, const MetaInfo &info) override { double auc {0}; - if (ctx_->gpu_id != Context::kCpuId) { - preds.SetDevice(ctx_->gpu_id); - info.labels.SetDevice(ctx_->gpu_id); - info.weights_.SetDevice(ctx_->gpu_id); + if (ctx_->Device().IsCUDA()) { + preds.SetDevice(ctx_->Device()); + info.labels.SetDevice(ctx_->Device()); + info.weights_.SetDevice(ctx_->Device()); } // We use the global size to handle empty dataset. std::array meta{info.labels.Size(), preds.Size()}; @@ -329,7 +328,7 @@ class EvalROCAUC : public EvalAUC { double auc{0}; uint32_t valid_groups = 0; auto n_threads = ctx_->Threads(); - if (ctx_->gpu_id == Context::kCpuId) { + if (ctx_->IsCPU()) { std::tie(auc, valid_groups) = RankingAUC(ctx_, predts.ConstHostVector(), info, n_threads); } else { @@ -344,7 +343,7 @@ class EvalROCAUC : public EvalAUC { double auc{0}; auto n_threads = ctx_->Threads(); CHECK_NE(n_classes, 0); - if (ctx_->gpu_id == Context::kCpuId) { + if (ctx_->IsCPU()) { auc = MultiClassOVR(ctx_, predts.ConstHostVector(), info, n_classes, n_threads, BinaryROCAUC); } else { auc = GPUMultiClassROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_, n_classes); @@ -355,7 +354,7 @@ class EvalROCAUC : public EvalAUC { std::tuple EvalBinary(HostDeviceVector const &predts, MetaInfo const &info) { double fp, tp, auc; - if (ctx_->gpu_id == Context::kCpuId) { + if (ctx_->IsCPU()) { std::tie(fp, tp, auc) = BinaryROCAUC(ctx_, predts.ConstHostVector(), info.labels.HostView().Slice(linalg::All(), 0), common::OptionalWeights{info.weights_.ConstHostSpan()}); @@ -367,7 +366,7 @@ class EvalROCAUC : public EvalAUC { } public: - char const* Name() const override { + [[nodiscard]] char const* Name() const override { return "auc"; } }; @@ -405,7 +404,7 @@ class EvalPRAUC : public EvalAUC { std::tuple EvalBinary(HostDeviceVector const &predts, MetaInfo const &info) { double pr, re, auc; - if (ctx_->gpu_id == Context::kCpuId) { + if (ctx_->IsCPU()) { std::tie(pr, re, auc) = BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0), common::OptionalWeights{info.weights_.ConstHostSpan()}); @@ -418,7 +417,7 @@ class EvalPRAUC : public EvalAUC { double EvalMultiClass(HostDeviceVector const &predts, MetaInfo const &info, size_t n_classes) { - if (ctx_->gpu_id == Context::kCpuId) { + if (ctx_->IsCPU()) { auto n_threads = this->ctx_->Threads(); return MultiClassOVR(ctx_, predts.ConstHostSpan(), info, n_classes, n_threads, BinaryPRAUC); } else { @@ -431,7 +430,7 @@ class EvalPRAUC : public EvalAUC { double auc{0}; uint32_t valid_groups = 0; auto n_threads = ctx_->Threads(); - if (ctx_->gpu_id == Context::kCpuId) { + if (ctx_->IsCPU()) { auto labels = info.labels.Data()->ConstHostSpan(); if (std::any_of(labels.cbegin(), labels.cend(), PRAUCLabelInvalid{})) { InvalidLabels(); @@ -446,7 +445,7 @@ class EvalPRAUC : public EvalAUC { } public: - const char *Name() const override { return "aucpr"; } + [[nodiscard]] const char *Name() const override { return "aucpr"; } }; XGBOOST_REGISTER_METRIC(AUCPR, "aucpr") @@ -473,5 +472,4 @@ std::pair GPURankingPRAUC(Context const *, common::Span GPURankingPRAUC(Context const *ctx, common::Span predts, MetaInfo const &info, std::shared_ptr *p_cache) { - dh::safe_cuda(cudaSetDevice(ctx->gpu_id)); - + dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); if (predts.empty()) { return std::make_pair(0.0, static_cast(0)); } diff --git 
a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu index 388487344..937e31400 100644 --- a/src/metric/elementwise_metric.cu +++ b/src/metric/elementwise_metric.cu @@ -46,7 +46,26 @@ template PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) { PackedReduceResult result; auto labels = info.labels.View(ctx->Device()); - if (ctx->IsCPU()) { + if (ctx->IsCUDA()) { +#if defined(XGBOOST_USE_CUDA) + dh::XGBCachingDeviceAllocator alloc; + thrust::counting_iterator begin(0); + thrust::counting_iterator end = begin + labels.Size(); + result = thrust::transform_reduce( + thrust::cuda::par(alloc), begin, end, + [=] XGBOOST_DEVICE(size_t i) { + auto idx = linalg::UnravelIndex(i, labels.Shape()); + auto sample_id = std::get<0>(idx); + auto target_id = std::get<1>(idx); + auto res = loss(i, sample_id, target_id); + float v{std::get<0>(res)}, wt{std::get<1>(res)}; + return PackedReduceResult{v, wt}; + }, + PackedReduceResult{}, thrust::plus()); +#else + common::AssertGPUSupport(); +#endif // defined(XGBOOST_USE_CUDA) + } else { auto n_threads = ctx->Threads(); std::vector score_tloc(n_threads, 0.0); std::vector weight_tloc(n_threads, 0.0); @@ -69,41 +88,6 @@ PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) { double residue_sum = std::accumulate(score_tloc.cbegin(), score_tloc.cend(), 0.0); double weights_sum = std::accumulate(weight_tloc.cbegin(), weight_tloc.cend(), 0.0); result = PackedReduceResult{residue_sum, weights_sum}; - } else { -#if defined(XGBOOST_USE_CUDA) - dh::XGBCachingDeviceAllocator alloc; - thrust::counting_iterator begin(0); - thrust::counting_iterator end = begin + labels.Size(); - result = thrust::transform_reduce( - thrust::cuda::par(alloc), begin, end, - [=] XGBOOST_DEVICE(size_t i) { - auto idx = linalg::UnravelIndex(i, labels.Shape()); - auto sample_id = std::get<0>(idx); - auto target_id = std::get<1>(idx); - auto res = loss(i, sample_id, target_id); - float v{std::get<0>(res)}, wt{std::get<1>(res)}; - return PackedReduceResult{v, wt}; - }, - PackedReduceResult{}, thrust::plus()); -#elif defined(XGBOOST_USE_HIP) - dh::XGBCachingDeviceAllocator alloc; - thrust::counting_iterator begin(0); - thrust::counting_iterator end = begin + labels.Size(); - - result = thrust::transform_reduce( - thrust::hip::par(alloc), begin, end, - [=] XGBOOST_DEVICE(size_t i) { - auto idx = linalg::UnravelIndex(i, labels.Shape()); - auto sample_id = std::get<0>(idx); - auto target_id = std::get<1>(idx); - auto res = loss(i, sample_id, target_id); - float v{std::get<0>(res)}, wt{std::get<1>(res)}; - return PackedReduceResult{v, wt}; - }, - PackedReduceResult{}, thrust::plus()); -#else - common::AssertGPUSupport(); -#endif // defined(XGBOOST_USE_CUDA) } return result; } @@ -201,10 +185,10 @@ class PseudoErrorLoss : public MetricNoCache { CHECK_EQ(info.labels.Shape(0), info.num_row_); auto labels = info.labels.View(ctx_->Device()); preds.SetDevice(ctx_->Device()); - auto predts = ctx_->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan(); + auto predts = ctx_->IsCUDA() ? preds.ConstDeviceSpan() : preds.ConstHostSpan(); info.weights_.SetDevice(ctx_->Device()); - common::OptionalWeights weights(ctx_->IsCPU() ? info.weights_.ConstHostSpan() - : info.weights_.ConstDeviceSpan()); + common::OptionalWeights weights(ctx_->IsCUDA() ? 
info.weights_.ConstDeviceSpan() + : info.weights_.ConstHostSpan()); float slope = this->param_.huber_slope; CHECK_NE(slope, 0.0) << "slope for pseudo huber cannot be 0."; PackedReduceResult result = @@ -367,10 +351,10 @@ struct EvalEWiseBase : public MetricNoCache { } auto labels = info.labels.View(ctx_->Device()); info.weights_.SetDevice(ctx_->Device()); - common::OptionalWeights weights(ctx_->IsCPU() ? info.weights_.ConstHostSpan() - : info.weights_.ConstDeviceSpan()); + common::OptionalWeights weights(ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan() + : info.weights_.ConstHostSpan()); preds.SetDevice(ctx_->Device()); - auto predts = ctx_->IsCPU() ? preds.ConstHostSpan() : preds.ConstDeviceSpan(); + auto predts = ctx_->IsCUDA() ? preds.ConstDeviceSpan() : preds.ConstHostSpan(); auto d_policy = policy_; auto result = diff --git a/src/metric/multiclass_metric.cu b/src/metric/multiclass_metric.cu index ba236a0be..6e9019488 100644 --- a/src/metric/multiclass_metric.cu +++ b/src/metric/multiclass_metric.cu @@ -149,24 +149,24 @@ class MultiClassMetricsReduction { #endif // XGBOOST_USE_CUDA || defined(XGBOOST_USE_HIP) - PackedReduceResult Reduce(const Context& tparam, int device, size_t n_class, + PackedReduceResult Reduce(const Context& ctx, DeviceOrd device, size_t n_class, const HostDeviceVector& weights, const HostDeviceVector& labels, const HostDeviceVector& preds) { PackedReduceResult result; - if (device < 0) { + if (device.IsCPU()) { result = - CpuReduceMetrics(weights, labels, preds, n_class, tparam.Threads()); + CpuReduceMetrics(weights, labels, preds, n_class, ctx.Threads()); } #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP) else { // NOLINT - device_ = tparam.gpu_id; + device_ = ctx.Device(); preds.SetDevice(device_); labels.SetDevice(device_); weights.SetDevice(device_); - dh::safe_cuda(cudaSetDevice(device_)); + dh::safe_cuda(cudaSetDevice(device_.ordinal)); result = DeviceReduceMetrics(weights, labels, preds, n_class); } #endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP) @@ -176,8 +176,8 @@ class MultiClassMetricsReduction { private: #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP) dh::PinnedMemory label_error_; - int device_{-1}; -#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP) + DeviceOrd device_{DeviceOrd::CPU()}; +#endif // defined(XGBOOST_USE_CUDA) }; /*! 
@@ -198,7 +198,7 @@ struct EvalMClassBase : public MetricNoCache { CHECK_GE(nclass, 1U) << "mlogloss and merror are only used for multi-class classification," << " use logloss for binary classification"; - int device = ctx_->gpu_id; + auto device = ctx_->Device(); auto result = reducer_.Reduce(*ctx_, device, nclass, info.weights_, *info.labels.Data(), preds); dat[0] = result.Residue(); diff --git a/src/metric/rank_metric.cu b/src/metric/rank_metric.cu index 4ab016006..30814447a 100644 --- a/src/metric/rank_metric.cu +++ b/src/metric/rank_metric.cu @@ -41,7 +41,7 @@ PackedReduceResult PreScore(Context const *ctx, MetaInfo const &info, auto d_gptr = p_cache->DataGroupPtr(ctx); auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0); - predt.SetDevice(ctx->gpu_id); + predt.SetDevice(ctx->Device()); auto d_rank_idx = p_cache->SortedIdx(ctx, predt.ConstDeviceSpan()); auto topk = p_cache->Param().TopK(); auto d_weight = common::MakeOptionalWeights(ctx, info.weights_); @@ -96,7 +96,7 @@ PackedReduceResult NDCGScore(Context const *ctx, MetaInfo const &info, CHECK_EQ(d_weight.weights.size(), p_cache->Groups()); } auto d_label = info.labels.View(ctx->Device()).Slice(linalg::All(), 0); - predt.SetDevice(ctx->gpu_id); + predt.SetDevice(ctx->Device()); auto d_predt = linalg::MakeTensorView(ctx, predt.ConstDeviceSpan(), predt.Size()); auto d_group_ptr = p_cache->DataGroupPtr(ctx); diff --git a/src/metric/survival_metric.cu b/src/metric/survival_metric.cu index ef49687f9..b501bed76 100644 --- a/src/metric/survival_metric.cu +++ b/src/metric/survival_metric.cu @@ -148,19 +148,18 @@ class ElementWiseSurvivalMetricsReduction { const HostDeviceVector& preds) { PackedReduceResult result; - if (ctx.gpu_id < 0) { + if (ctx.IsCPU()) { result = CpuReduceMetrics(weights, labels_lower_bound, labels_upper_bound, preds, ctx.Threads()); } #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP) else { // NOLINT - preds.SetDevice(ctx.gpu_id); - labels_lower_bound.SetDevice(ctx.gpu_id); - labels_upper_bound.SetDevice(ctx.gpu_id); - weights.SetDevice(ctx.gpu_id); - - dh::safe_cuda(cudaSetDevice(ctx.gpu_id)); + preds.SetDevice(ctx.Device()); + labels_lower_bound.SetDevice(ctx.Device()); + labels_upper_bound.SetDevice(ctx.Device()); + weights.SetDevice(ctx.Device()); + dh::safe_cuda(cudaSetDevice(ctx.Ordinal())); result = DeviceReduceMetrics(weights, labels_lower_bound, labels_upper_bound, preds); } #endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP) diff --git a/src/objective/adaptive.h b/src/objective/adaptive.h index ffd3ddec7..a64f37f63 100644 --- a/src/objective/adaptive.h +++ b/src/objective/adaptive.h @@ -96,13 +96,13 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector const& posit inline void UpdateTreeLeaf(Context const* ctx, HostDeviceVector const& position, std::int32_t group_idx, MetaInfo const& info, float learning_rate, HostDeviceVector const& predt, float alpha, RegTree* p_tree) { - if (ctx->IsCPU()) { - detail::UpdateTreeLeafHost(ctx, position.ConstHostVector(), group_idx, info, learning_rate, - predt, alpha, p_tree); - } else { - position.SetDevice(ctx->gpu_id); + if (ctx->IsCUDA()) { + position.SetDevice(ctx->Device()); detail::UpdateTreeLeafDevice(ctx, position.ConstDeviceSpan(), group_idx, info, learning_rate, predt, alpha, p_tree); + } else { + detail::UpdateTreeLeafHost(ctx, position.ConstHostVector(), group_idx, info, learning_rate, + predt, alpha, p_tree); } } } // namespace obj diff --git a/src/objective/aft_obj.cu b/src/objective/aft_obj.cu index 
c2c8662b2..74a08ed86 100644 --- a/src/objective/aft_obj.cu +++ b/src/objective/aft_obj.cu @@ -42,7 +42,7 @@ class AFTObj : public ObjFunction { template void GetGradientImpl(const HostDeviceVector& preds, const MetaInfo& info, - linalg::Matrix* out_gpair, size_t ndata, int device, + linalg::Matrix* out_gpair, size_t ndata, DeviceOrd device, bool is_null_weight, float aft_loss_distribution_scale) { common::Transform<>::Init( [=] XGBOOST_DEVICE(size_t _idx, @@ -75,7 +75,7 @@ class AFTObj : public ObjFunction { CHECK_EQ(info.labels_upper_bound_.Size(), ndata); out_gpair->SetDevice(ctx_->Device()); out_gpair->Reshape(ndata, 1); - const int device = ctx_->gpu_id; + const auto device = ctx_->Device(); const float aft_loss_distribution_scale = param_.aft_loss_distribution_scale; const bool is_null_weight = info.weights_.Size() == 0; if (!is_null_weight) { @@ -108,7 +108,7 @@ class AFTObj : public ObjFunction { _preds[_idx] = exp(_preds[_idx]); }, common::Range{0, static_cast(io_preds->Size())}, this->ctx_->Threads(), - io_preds->DeviceIdx()) + io_preds->Device()) .Eval(io_preds); } diff --git a/src/objective/hinge.cu b/src/objective/hinge.cu index 0473f3636..ded9a05ed 100644 --- a/src/objective/hinge.cu +++ b/src/objective/hinge.cu @@ -1,5 +1,5 @@ -/*! - * Copyright 2018-2022 by XGBoost Contributors +/** + * Copyright 2018-2023, XGBoost Contributors * \file hinge.cc * \brief Provides an implementation of the hinge loss function * \author Henry Gouk @@ -13,8 +13,7 @@ #include "../common/transform.h" #include "../common/common.h" -namespace xgboost { -namespace obj { +namespace xgboost::obj { #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP) DMLC_REGISTRY_FILE_TAG(hinge_obj_gpu); @@ -63,7 +62,7 @@ class HingeObj : public ObjFunction { _out_gpair[_idx] = GradientPair(g, h); }, common::Range{0, static_cast(ndata)}, this->ctx_->Threads(), - ctx_->gpu_id).Eval( + ctx_->Device()).Eval( out_gpair->Data(), &preds, info.labels.Data(), &info.weights_); } @@ -73,11 +72,11 @@ class HingeObj : public ObjFunction { _preds[_idx] = _preds[_idx] > 0.0 ? 1.0 : 0.0; }, common::Range{0, static_cast(io_preds->Size()), 1}, this->ctx_->Threads(), - io_preds->DeviceIdx()) + io_preds->Device()) .Eval(io_preds); } - const char* DefaultEvalMetric() const override { + [[nodiscard]] const char* DefaultEvalMetric() const override { return "error"; } @@ -93,5 +92,4 @@ XGBOOST_REGISTER_OBJECTIVE(HingeObj, "binary:hinge") .describe("Hinge loss. Expects labels to be in [0,1f]") .set_body([]() { return new HingeObj(); }); -} // namespace obj -} // namespace xgboost +} // namespace xgboost::obj diff --git a/src/objective/init_estimation.cc b/src/objective/init_estimation.cc index 47e0364fe..df06882bb 100644 --- a/src/objective/init_estimation.cc +++ b/src/objective/init_estimation.cc @@ -20,8 +20,8 @@ void FitIntercept::InitEstimation(MetaInfo const& info, linalg::Vector* b CheckInitInputs(info); } // Avoid altering any state in child objective. 
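// The hunks above and below repeat one mechanical substitution: APIs that
// took a raw `int` ordinal (with -1 meaning "run on the CPU") now take a
// typed device descriptor. A minimal sketch of such a type, assuming C++17;
// `Device` and its members are illustrative stand-ins, not the actual
// xgboost `DeviceOrd` declaration:

#include <cstdint>
#include <string>

struct Device {
  enum class Kind : std::int8_t { kCPU = 0, kCUDA = 1 } kind{Kind::kCPU};
  std::int32_t ordinal{-1};  // meaningful only when kind == kCUDA

  static Device CPU() { return {}; }
  static Device CUDA(std::int32_t ord) { return {Kind::kCUDA, ord}; }

  bool IsCPU() const { return kind == Kind::kCPU; }
  bool IsCUDA() const { return kind == Kind::kCUDA; }
  std::string Name() const {  // "cpu", "cuda:0", ...
    return IsCPU() ? std::string{"cpu"} : "cuda:" + std::to_string(ordinal);
  }
};

// Call sites then read `v.SetDevice(ctx->Device())` instead of
// `v.SetDevice(ctx->gpu_id)`, and a bare ordinal survives only at CUDA API
// boundaries, e.g. `cudaSetDevice(device.ordinal)`.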
- HostDeviceVector dummy_predt(info.labels.Size(), 0.0f, this->ctx_->gpu_id); - linalg::Matrix gpair(info.labels.Shape(), this->ctx_->gpu_id); + HostDeviceVector dummy_predt(info.labels.Size(), 0.0f, this->ctx_->Device()); + linalg::Matrix gpair(info.labels.Shape(), this->ctx_->Device()); Json config{Object{}}; this->SaveConfig(&config); diff --git a/src/objective/lambdarank_obj.cc b/src/objective/lambdarank_obj.cc index e751ca5e6..efddf636e 100644 --- a/src/objective/lambdarank_obj.cc +++ b/src/objective/lambdarank_obj.cc @@ -103,19 +103,19 @@ class LambdaRankObj : public FitIntercept { // Update position biased for unbiased click data void UpdatePositionBias() { - li_full_.SetDevice(ctx_->gpu_id); - lj_full_.SetDevice(ctx_->gpu_id); - li_.SetDevice(ctx_->gpu_id); - lj_.SetDevice(ctx_->gpu_id); + li_full_.SetDevice(ctx_->Device()); + lj_full_.SetDevice(ctx_->Device()); + li_.SetDevice(ctx_->Device()); + lj_.SetDevice(ctx_->Device()); - if (ctx_->IsCPU()) { - cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()), - lj_full_.View(ctx_->Device()), &ti_plus_, &tj_minus_, - &li_, &lj_, p_cache_); - } else { + if (ctx_->IsCUDA()) { cuda_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()), lj_full_.View(ctx_->Device()), &ti_plus_, &tj_minus_, &li_, &lj_, p_cache_); + } else { + cpu_impl::LambdaRankUpdatePositionBias(ctx_, li_full_.View(ctx_->Device()), + lj_full_.View(ctx_->Device()), &ti_plus_, &tj_minus_, + &li_, &lj_, p_cache_); } li_full_.Data()->Fill(0.0); diff --git a/src/objective/lambdarank_obj.cu b/src/objective/lambdarank_obj.cu index f0a7f1d5e..9d908c19c 100644 --- a/src/objective/lambdarank_obj.cu +++ b/src/objective/lambdarank_obj.cu @@ -296,12 +296,12 @@ void Launch(Context const* ctx, std::int32_t iter, HostDeviceVector const linalg::VectorView li, linalg::VectorView lj, linalg::Matrix* out_gpair) { // boilerplate - std::int32_t device_id = ctx->gpu_id; - dh::safe_cuda(cudaSetDevice(device_id)); + auto device = ctx->Device(); + dh::safe_cuda(cudaSetDevice(device.ordinal)); auto n_groups = p_cache->Groups(); - info.labels.SetDevice(device_id); - preds.SetDevice(device_id); + info.labels.SetDevice(device); + preds.SetDevice(device); out_gpair->SetDevice(ctx->Device()); out_gpair->Reshape(preds.Size(), 1); diff --git a/src/objective/multiclass_obj.cu b/src/objective/multiclass_obj.cu index b7e6228af..4813ed5f9 100644 --- a/src/objective/multiclass_obj.cu +++ b/src/objective/multiclass_obj.cu @@ -63,7 +63,7 @@ class SoftmaxMultiClassObj : public ObjFunction { const int nclass = param_.num_class; const auto ndata = static_cast(preds.Size() / nclass); - auto device = ctx_->gpu_id; + auto device = ctx_->Device(); out_gpair->SetDevice(device); info.labels.SetDevice(device); info.weights_.SetDevice(device); @@ -133,7 +133,7 @@ class SoftmaxMultiClassObj : public ObjFunction { const int nclass = param_.num_class; const auto ndata = static_cast(io_preds->Size() / nclass); - auto device = io_preds->DeviceIdx(); + auto device = io_preds->Device(); if (prob) { common::Transform<>::Init( [=] XGBOOST_DEVICE(size_t _idx, common::Span _preds) { diff --git a/src/objective/quantile_obj.cu b/src/objective/quantile_obj.cu index 05b5309ee..b6e3a7d67 100644 --- a/src/objective/quantile_obj.cu +++ b/src/objective/quantile_obj.cu @@ -70,16 +70,16 @@ class QuantileRegression : public ObjFunction { out_gpair->Reshape(info.num_row_, n_targets); auto gpair = out_gpair->View(ctx_->Device()); - info.weights_.SetDevice(ctx_->gpu_id); - common::OptionalWeights 
weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan() - : info.weights_.ConstDeviceSpan()}; + info.weights_.SetDevice(ctx_->Device()); + common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan() + : info.weights_.ConstHostSpan()}; - preds.SetDevice(ctx_->gpu_id); + preds.SetDevice(ctx_->Device()); auto predt = linalg::MakeVec(&preds); auto n_samples = info.num_row_; - alpha_.SetDevice(ctx_->gpu_id); - auto alpha = ctx_->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan(); + alpha_.SetDevice(ctx_->Device()); + auto alpha = ctx_->IsCUDA() ? alpha_.ConstDeviceSpan() : alpha_.ConstHostSpan(); linalg::ElementWiseKernel( ctx_, gpair, [=] XGBOOST_DEVICE(std::size_t i, GradientPair const&) mutable { @@ -103,11 +103,48 @@ class QuantileRegression : public ObjFunction { CHECK(!alpha_.Empty()); auto n_targets = this->Targets(info); - base_score->SetDevice(ctx_->gpu_id); + base_score->SetDevice(ctx_->Device()); base_score->Reshape(n_targets); double sw{0}; - if (ctx_->IsCPU()) { + if (ctx_->IsCUDA()) { +#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP) + alpha_.SetDevice(ctx_->Device()); + auto d_alpha = alpha_.ConstDeviceSpan(); + auto d_labels = info.labels.View(ctx_->Device()); + auto seg_it = dh::MakeTransformIterator( + thrust::make_counting_iterator(0ul), + [=] XGBOOST_DEVICE(std::size_t i) { return i * d_labels.Shape(0); }); + CHECK_EQ(d_labels.Shape(1), 1); + auto val_it = dh::MakeTransformIterator(thrust::make_counting_iterator(0ul), + [=] XGBOOST_DEVICE(std::size_t i) { + auto sample_idx = i % d_labels.Shape(0); + return d_labels(sample_idx, 0); + }); + auto n = d_labels.Size() * d_alpha.size(); + CHECK_EQ(base_score->Size(), d_alpha.size()); + if (info.weights_.Empty()) { + common::SegmentedQuantile(ctx_, d_alpha.data(), seg_it, seg_it + d_alpha.size() + 1, val_it, + val_it + n, base_score->Data()); + sw = info.num_row_; + } else { + info.weights_.SetDevice(ctx_->Device()); + auto d_weights = info.weights_.ConstDeviceSpan(); + auto weight_it = dh::MakeTransformIterator(thrust::make_counting_iterator(0ul), + [=] XGBOOST_DEVICE(std::size_t i) { + auto sample_idx = i % d_labels.Shape(0); + return d_weights[sample_idx]; + }); + common::SegmentedWeightedQuantile(ctx_, d_alpha.data(), seg_it, seg_it + d_alpha.size() + 1, + val_it, val_it + n, weight_it, weight_it + n, + base_score->Data()); + sw = dh::Reduce(ctx_->CUDACtx()->CTP(), dh::tcbegin(d_weights), dh::tcend(d_weights), 0.0, + thrust::plus{}); + } +#else + common::AssertGPUSupport(); +#endif // defined(XGBOOST_USE_CUDA) + } else { auto quantiles = base_score->HostView(); auto h_weights = info.weights_.ConstHostVector(); if (info.weights_.Empty()) { @@ -127,43 +164,6 @@ class QuantileRegression : public ObjFunction { linalg::cend(h_labels), std::cbegin(h_weights)); } } - } else { -#if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP) - alpha_.SetDevice(ctx_->gpu_id); - auto d_alpha = alpha_.ConstDeviceSpan(); - auto d_labels = info.labels.View(ctx_->Device()); - auto seg_it = dh::MakeTransformIterator( - thrust::make_counting_iterator(0ul), - [=] XGBOOST_DEVICE(std::size_t i) { return i * d_labels.Shape(0); }); - CHECK_EQ(d_labels.Shape(1), 1); - auto val_it = dh::MakeTransformIterator(thrust::make_counting_iterator(0ul), - [=] XGBOOST_DEVICE(std::size_t i) { - auto sample_idx = i % d_labels.Shape(0); - return d_labels(sample_idx, 0); - }); - auto n = d_labels.Size() * d_alpha.size(); - CHECK_EQ(base_score->Size(), d_alpha.size()); - if (info.weights_.Empty()) { - common::SegmentedQuantile(ctx_, 
d_alpha.data(), seg_it, seg_it + d_alpha.size() + 1, val_it, - val_it + n, base_score->Data()); - sw = info.num_row_; - } else { - info.weights_.SetDevice(ctx_->gpu_id); - auto d_weights = info.weights_.ConstDeviceSpan(); - auto weight_it = dh::MakeTransformIterator(thrust::make_counting_iterator(0ul), - [=] XGBOOST_DEVICE(std::size_t i) { - auto sample_idx = i % d_labels.Shape(0); - return d_weights[sample_idx]; - }); - common::SegmentedWeightedQuantile(ctx_, d_alpha.data(), seg_it, seg_it + d_alpha.size() + 1, - val_it, val_it + n, weight_it, weight_it + n, - base_score->Data()); - sw = dh::Reduce(ctx_->CUDACtx()->CTP(), dh::tcbegin(d_weights), dh::tcend(d_weights), 0.0, - thrust::plus{}); - } -#else - common::AssertGPUSupport(); -#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP) } // For multiple quantiles, we should extend the base score to a vector instead of diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu index 2a461fc0a..9168c79fb 100644 --- a/src/objective/regression_obj.cu +++ b/src/objective/regression_obj.cu @@ -116,7 +116,7 @@ class RegLossObj : public FitIntercept { size_t const ndata = preds.Size(); out_gpair->SetDevice(ctx_->Device()); - auto device = ctx_->gpu_id; + auto device = ctx_->Device(); bool is_null_weight = info.weights_.Size() == 0; auto scale_pos_weight = param_.scale_pos_weight; @@ -124,7 +124,7 @@ class RegLossObj : public FitIntercept { additional_input_.HostVector().begin()[1] = is_null_weight; const size_t nthreads = ctx_->Threads(); - bool on_device = device >= 0; + bool on_device = device.IsCUDA(); // On CPU we run the transformation each thread processing a contigious block of data // for better performance. const size_t n_data_blocks = std::max(static_cast(1), (on_device ? ndata : nthreads)); @@ -175,7 +175,7 @@ class RegLossObj : public FitIntercept { _preds[_idx] = Loss::PredTransform(_preds[_idx]); }, common::Range{0, static_cast(io_preds->Size())}, this->ctx_->Threads(), - io_preds->DeviceIdx()) + io_preds->Device()) .Eval(io_preds); } @@ -246,16 +246,16 @@ class PseudoHuberRegression : public FitIntercept { CHECK_NE(slope, 0.0) << "slope for pseudo huber cannot be 0."; auto labels = info.labels.View(ctx_->Device()); - out_gpair->SetDevice(ctx_->gpu_id); + out_gpair->SetDevice(ctx_->Device()); out_gpair->Reshape(info.num_row_, this->Targets(info)); auto gpair = out_gpair->View(ctx_->Device()); - preds.SetDevice(ctx_->gpu_id); + preds.SetDevice(ctx_->Device()); auto predt = linalg::MakeVec(&preds); - info.weights_.SetDevice(ctx_->gpu_id); - common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan() - : info.weights_.ConstDeviceSpan()}; + info.weights_.SetDevice(ctx_->Device()); + common::OptionalWeights weight{ctx_->IsCUDA() ? 
info.weights_.ConstDeviceSpan() + : info.weights_.ConstHostSpan()}; linalg::ElementWiseKernel(ctx_, labels, [=] XGBOOST_DEVICE(size_t i, float const y) mutable { auto sample_id = std::get<0>(linalg::UnravelIndex(i, labels.Shape())); @@ -287,6 +287,13 @@ class PseudoHuberRegression : public FitIntercept { } FromJson(in["pseudo_huber_param"], ¶m_); } + [[nodiscard]] Json DefaultMetricConfig() const override { + CHECK(param_.GetInitialised()); + Json config{Object{}}; + config["name"] = String{this->DefaultEvalMetric()}; + config["pseudo_huber_param"] = ToJson(param_); + return config; + } }; XGBOOST_REGISTER_OBJECTIVE(PseudoHuberRegression, "reg:pseudohubererror") @@ -320,7 +327,7 @@ class PoissonRegression : public FitIntercept { size_t const ndata = preds.Size(); out_gpair->SetDevice(ctx_->Device()); out_gpair->Reshape(info.num_row_, this->Targets(info)); - auto device = ctx_->gpu_id; + auto device = ctx_->Device(); label_correct_.Resize(1); label_correct_.Fill(1); @@ -362,7 +369,7 @@ class PoissonRegression : public FitIntercept { _preds[_idx] = expf(_preds[_idx]); }, common::Range{0, static_cast(io_preds->Size())}, this->ctx_->Threads(), - io_preds->DeviceIdx()) + io_preds->Device()) .Eval(io_preds); } void EvalTransform(HostDeviceVector *io_preds) override { @@ -505,7 +512,7 @@ class GammaRegression : public FitIntercept { CHECK_NE(info.labels.Size(), 0U) << "label set cannot be empty"; CHECK_EQ(preds.Size(), info.labels.Size()) << "labels are not correctly provided"; const size_t ndata = preds.Size(); - auto device = ctx_->gpu_id; + auto device = ctx_->Device(); out_gpair->SetDevice(ctx_->Device()); out_gpair->Reshape(info.num_row_, this->Targets(info)); label_correct_.Resize(1); @@ -548,7 +555,7 @@ class GammaRegression : public FitIntercept { _preds[_idx] = expf(_preds[_idx]); }, common::Range{0, static_cast(io_preds->Size())}, this->ctx_->Threads(), - io_preds->DeviceIdx()) + io_preds->Device()) .Eval(io_preds); } void EvalTransform(HostDeviceVector *io_preds) override { @@ -606,7 +613,7 @@ class TweedieRegression : public FitIntercept { out_gpair->SetDevice(ctx_->Device()); out_gpair->Reshape(info.num_row_, this->Targets(info)); - auto device = ctx_->gpu_id; + auto device = ctx_->Device(); label_correct_.Resize(1); label_correct_.Fill(1); @@ -653,7 +660,7 @@ class TweedieRegression : public FitIntercept { _preds[_idx] = expf(_preds[_idx]); }, common::Range{0, static_cast(io_preds->Size())}, this->ctx_->Threads(), - io_preds->DeviceIdx()) + io_preds->Device()) .Eval(io_preds); } @@ -704,11 +711,11 @@ class MeanAbsoluteError : public ObjFunction { out_gpair->Reshape(info.num_row_, this->Targets(info)); auto gpair = out_gpair->View(ctx_->Device()); - preds.SetDevice(ctx_->gpu_id); + preds.SetDevice(ctx_->Device()); auto predt = linalg::MakeVec(&preds); - info.weights_.SetDevice(ctx_->gpu_id); - common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan() - : info.weights_.ConstDeviceSpan()}; + info.weights_.SetDevice(ctx_->Device()); + common::OptionalWeights weight{ctx_->IsCUDA() ? 
info.weights_.ConstDeviceSpan() + : info.weights_.ConstHostSpan()}; linalg::ElementWiseKernel(ctx_, labels, [=] XGBOOST_DEVICE(std::size_t i, float y) mutable { auto sign = [](auto x) { diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index b1ab57b98..4a75903b7 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -180,33 +180,30 @@ struct DeviceAdapterLoader { XGBOOST_DEV_INLINE DeviceAdapterLoader(Batch const batch, bool use_shared, bst_feature_t num_features, bst_row_t num_rows, - size_t entry_start, float missing) : - batch{batch}, - columns{num_features}, - use_shared{use_shared}, - is_valid{missing} { - extern __shared__ float _smem[]; - smem = _smem; - if (use_shared) { - uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x; - size_t shared_elements = blockDim.x * num_features; - dh::BlockFill(smem, shared_elements, nanf("")); - __syncthreads(); - if (global_idx < num_rows) { - auto beg = global_idx * columns; - auto end = (global_idx + 1) * columns; - for (size_t i = beg; i < end; ++i) { - auto value = batch.GetElement(i).value; - if (is_valid(value)) { - smem[threadIdx.x * num_features + (i - beg)] = value; - } + size_t entry_start, float missing) + : batch{batch}, columns{num_features}, use_shared{use_shared}, is_valid{missing} { + extern __shared__ float _smem[]; + smem = _smem; + if (use_shared) { + uint32_t global_idx = blockDim.x * blockIdx.x + threadIdx.x; + size_t shared_elements = blockDim.x * num_features; + dh::BlockFill(smem, shared_elements, nanf("")); + __syncthreads(); + if (global_idx < num_rows) { + auto beg = global_idx * columns; + auto end = (global_idx + 1) * columns; + for (size_t i = beg; i < end; ++i) { + auto value = batch.GetElement(i).value; + if (is_valid(value)) { + smem[threadIdx.x * num_features + (i - beg)] = value; } } } - __syncthreads(); } + __syncthreads(); + } - XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const { + [[nodiscard]] XGBOOST_DEV_INLINE float GetElement(size_t ridx, size_t fidx) const { if (use_shared) { return smem[threadIdx.x * columns + fidx]; } @@ -340,11 +337,11 @@ class DeviceModel { size_t tree_end_; // NOLINT int num_group; - void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, int32_t gpu_id) { - dh::safe_cuda(cudaSetDevice(gpu_id)); + void Init(const gbm::GBTreeModel& model, size_t tree_begin, size_t tree_end, DeviceOrd device) { + dh::safe_cuda(cudaSetDevice(device.ordinal)); // Copy decision trees to device - tree_segments = HostDeviceVector({}, gpu_id); + tree_segments = HostDeviceVector({}, device); auto& h_tree_segments = tree_segments.HostVector(); h_tree_segments.reserve((tree_end - tree_begin) + 1); size_t sum = 0; @@ -354,8 +351,8 @@ class DeviceModel { h_tree_segments.push_back(sum); } - nodes = HostDeviceVector(h_tree_segments.back(), RegTree::Node(), gpu_id); - stats = HostDeviceVector(h_tree_segments.back(), RTreeNodeStat(), gpu_id); + nodes = HostDeviceVector(h_tree_segments.back(), RegTree::Node(), device); + stats = HostDeviceVector(h_tree_segments.back(), RTreeNodeStat(), device); auto d_nodes = nodes.DevicePointer(); auto d_stats = stats.DevicePointer(); for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { @@ -370,12 +367,12 @@ class DeviceModel { sizeof(RTreeNodeStat) * src_stats.size(), cudaMemcpyDefault)); } - tree_group = HostDeviceVector(model.tree_info.size(), 0, gpu_id); + tree_group = HostDeviceVector(model.tree_info.size(), 0, device); auto& h_tree_group = 
tree_group.HostVector(); std::memcpy(h_tree_group.data(), model.tree_info.data(), sizeof(int) * model.tree_info.size()); // Initialize categorical splits. - split_types.SetDevice(gpu_id); + split_types.SetDevice(device); std::vector& h_split_types = split_types.HostVector(); h_split_types.resize(h_tree_segments.back()); for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) { @@ -384,8 +381,8 @@ class DeviceModel { h_split_types.begin() + h_tree_segments[tree_idx - tree_begin]); } - categories = HostDeviceVector({}, gpu_id); - categories_tree_segments = HostDeviceVector(1, 0, gpu_id); + categories = HostDeviceVector({}, device); + categories_tree_segments = HostDeviceVector(1, 0, device); std::vector &h_categories = categories.HostVector(); std::vector &h_split_cat_segments = categories_tree_segments.HostVector(); for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) { @@ -398,7 +395,7 @@ class DeviceModel { } categories_node_segments = HostDeviceVector( - h_tree_segments.back(), {}, gpu_id); + h_tree_segments.back(), {}, device); std::vector& h_categories_node_segments = categories_node_segments.HostVector(); for (auto tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) { @@ -490,8 +487,8 @@ struct PathInfo { void ExtractPaths( dh::device_vector> *paths, DeviceModel *model, dh::device_vector *path_categories, - int gpu_id) { - dh::safe_cuda(cudaSetDevice(gpu_id)); + DeviceOrd device) { + dh::safe_cuda(cudaSetDevice(device.ordinal)); auto& device_model = *model; dh::caching_device_vector info(device_model.nodes.Size()); @@ -654,11 +651,12 @@ __global__ void MaskBitVectorKernel( common::Span d_categories, BitVector decision_bits, BitVector missing_bits, std::size_t tree_begin, std::size_t tree_end, std::size_t num_features, std::size_t num_rows, std::size_t entry_start, std::size_t num_nodes, bool use_shared, float missing) { + // This needs to be always instantiated since the data is loaded cooperatively by all threads. 
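+  // (Every thread in the block must reach the __syncthreads() issued while
+  // the loader cooperatively fills shared memory; constructing the loader
+  // after the bounds check below would let out-of-range threads skip the
+  // barrier, which is undefined behaviour in CUDA.)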
+ SparsePageLoader loader(data, use_shared, num_features, num_rows, entry_start, missing); auto const row_idx = blockIdx.x * blockDim.x + threadIdx.x; if (row_idx >= num_rows) { return; } - SparsePageLoader loader(data, use_shared, num_features, num_rows, entry_start, missing); std::size_t tree_offset = 0; for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { @@ -689,10 +687,10 @@ __global__ void MaskBitVectorKernel( } } -__device__ float GetLeafWeightByBitVector(bst_row_t ridx, TreeView const& tree, - BitVector const& decision_bits, - BitVector const& missing_bits, std::size_t num_nodes, - std::size_t tree_offset) { +__device__ bst_node_t GetLeafIndexByBitVector(bst_row_t ridx, TreeView const& tree, + BitVector const& decision_bits, + BitVector const& missing_bits, std::size_t num_nodes, + std::size_t tree_offset) { bst_node_t nidx = 0; RegTree::Node n = tree.d_tree[nidx]; while (!n.IsLeaf()) { @@ -704,9 +702,19 @@ __device__ float GetLeafWeightByBitVector(bst_row_t ridx, TreeView const& tree, } n = tree.d_tree[nidx]; } + return nidx; +} + +__device__ float GetLeafWeightByBitVector(bst_row_t ridx, TreeView const& tree, + BitVector const& decision_bits, + BitVector const& missing_bits, std::size_t num_nodes, + std::size_t tree_offset) { + auto const nidx = + GetLeafIndexByBitVector(ridx, tree, decision_bits, missing_bits, num_nodes, tree_offset); return tree.d_tree[nidx].LeafValue(); } +template __global__ void PredictByBitVectorKernel( common::Span d_nodes, common::Span d_out_predictions, common::Span d_tree_segments, common::Span d_tree_group, @@ -722,27 +730,39 @@ __global__ void PredictByBitVectorKernel( } std::size_t tree_offset = 0; - if (num_group == 1) { - float sum = 0; - for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { + if constexpr (predict_leaf) { + for (size_t tree_idx = tree_begin; tree_idx < tree_end; ++tree_idx) { TreeView d_tree{tree_begin, tree_idx, d_nodes, d_tree_segments, d_tree_split_types, d_cat_tree_segments, d_cat_node_segments, d_categories}; - sum += GetLeafWeightByBitVector(row_idx, d_tree, decision_bits, missing_bits, num_nodes, - tree_offset); + auto const leaf = GetLeafIndexByBitVector(row_idx, d_tree, decision_bits, missing_bits, + num_nodes, tree_offset); + d_out_predictions[row_idx * (tree_end - tree_begin) + tree_idx] = static_cast(leaf); tree_offset += d_tree.d_tree.size(); } - d_out_predictions[row_idx] += sum; } else { - for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { - auto const tree_group = d_tree_group[tree_idx]; - TreeView d_tree{tree_begin, tree_idx, d_nodes, - d_tree_segments, d_tree_split_types, d_cat_tree_segments, - d_cat_node_segments, d_categories}; - bst_uint out_prediction_idx = row_idx * num_group + tree_group; - d_out_predictions[out_prediction_idx] += GetLeafWeightByBitVector( - row_idx, d_tree, decision_bits, missing_bits, num_nodes, tree_offset); - tree_offset += d_tree.d_tree.size(); + if (num_group == 1) { + float sum = 0; + for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { + TreeView d_tree{tree_begin, tree_idx, d_nodes, + d_tree_segments, d_tree_split_types, d_cat_tree_segments, + d_cat_node_segments, d_categories}; + sum += GetLeafWeightByBitVector(row_idx, d_tree, decision_bits, missing_bits, num_nodes, + tree_offset); + tree_offset += d_tree.d_tree.size(); + } + d_out_predictions[row_idx] += sum; + } else { + for (auto tree_idx = tree_begin; tree_idx < tree_end; tree_idx++) { + auto const tree_group = d_tree_group[tree_idx]; + TreeView d_tree{tree_begin, 
tree_idx, d_nodes,
+                        d_tree_segments, d_tree_split_types, d_cat_tree_segments,
+                        d_cat_node_segments, d_categories};
+        bst_uint out_prediction_idx = row_idx * num_group + tree_group;
+        d_out_predictions[out_prediction_idx] += GetLeafWeightByBitVector(
+            row_idx, d_tree, decision_bits, missing_bits, num_nodes, tree_offset);
+        tree_offset += d_tree.d_tree.size();
+      }
     }
   }
 }
@@ -754,21 +774,29 @@ class ColumnSplitHelper {
   void PredictBatch(DMatrix* dmat, HostDeviceVector<float>* out_preds,
                     gbm::GBTreeModel const& model, DeviceModel const& d_model) const {
     CHECK(dmat->PageExists()) << "Column split for external memory is not supported.";
-    PredictDMatrix(dmat, out_preds, d_model, model.learner_model_param->num_feature,
-                   model.learner_model_param->num_output_group);
+    PredictDMatrix<false>(dmat, out_preds, d_model, model.learner_model_param->num_feature,
+                          model.learner_model_param->num_output_group);
+  }
+
+  void PredictLeaf(DMatrix* dmat, HostDeviceVector<float>* out_preds,
+                   gbm::GBTreeModel const& model, DeviceModel const& d_model) const {
+    CHECK(dmat->PageExists()) << "Column split for external memory is not supported.";
+    PredictDMatrix<true>(dmat, out_preds, d_model, model.learner_model_param->num_feature,
+                         model.learner_model_param->num_output_group);
   }

  private:
   using BitType = BitVector::value_type;

+  template <bool predict_leaf>
   void PredictDMatrix(DMatrix* dmat, HostDeviceVector<float>* out_preds, DeviceModel const& model,
                       bst_feature_t num_features, std::uint32_t num_group) const {
-    dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
+    dh::safe_cuda(cudaSetDevice(ctx_->Ordinal()));
     dh::caching_device_vector<BitType> decision_storage{};
     dh::caching_device_vector<BitType> missing_storage{};
     auto constexpr kBlockThreads = 128;
-    auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->gpu_id);
+    auto const max_shared_memory_bytes = dh::MaxSharedMemory(ctx_->Ordinal());
     auto const shared_memory_bytes = SharedMemoryBytes(num_features, max_shared_memory_bytes);
     auto const use_shared = shared_memory_bytes != 0;
@@ -781,8 +809,8 @@ class ColumnSplitHelper {
       BitVector decision_bits{dh::ToSpan(decision_storage)};
       BitVector missing_bits{dh::ToSpan(missing_storage)};
-      batch.offset.SetDevice(ctx_->gpu_id);
-      batch.data.SetDevice(ctx_->gpu_id);
+      batch.offset.SetDevice(ctx_->Device());
+      batch.data.SetDevice(ctx_->Device());
       std::size_t entry_start = 0;
       SparsePageView data(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), num_features);
@@ -798,7 +826,7 @@ class ColumnSplitHelper {
       AllReduceBitVectors(&decision_storage, &missing_storage);
       dh::LaunchKernel {grid, kBlockThreads, 0, ctx_->CUDACtx()->Stream()} (
-          PredictByBitVectorKernel, model.nodes.ConstDeviceSpan(),
+          PredictByBitVectorKernel<predict_leaf>, model.nodes.ConstDeviceSpan(),
           out_preds->DeviceSpan().subspan(batch_offset), model.tree_segments.ConstDeviceSpan(),
           model.tree_group.ConstDeviceSpan(), model.split_types.ConstDeviceSpan(),
           model.categories_tree_segments.ConstDeviceSpan(),
@@ -813,15 +841,14 @@ class ColumnSplitHelper {
   void AllReduceBitVectors(dh::caching_device_vector<BitType>* decision_storage,
                            dh::caching_device_vector<BitType>* missing_storage) const {
     collective::AllReduce<collective::Operation::kBitwiseOR>(
-        ctx_->gpu_id, decision_storage->data().get(), decision_storage->size());
+        ctx_->Ordinal(), decision_storage->data().get(), decision_storage->size());
     collective::AllReduce<collective::Operation::kBitwiseAND>(
-        ctx_->gpu_id, missing_storage->data().get(), missing_storage->size());
-    collective::Synchronize(ctx_->gpu_id);
+        ctx_->Ordinal(), missing_storage->data().get(), missing_storage->size());
   }

   void ResizeBitVectors(dh::caching_device_vector<BitType>* decision_storage,
-
dh::caching_device_vector* missing_storage, - std::size_t total_bits) const { + dh::caching_device_vector* missing_storage, + std::size_t total_bits) const { auto const size = BitVector::ComputeStorageSize(total_bits); if (decision_storage->size() < size) { decision_storage->resize(size); @@ -844,12 +871,12 @@ class GPUPredictor : public xgboost::Predictor { size_t num_features, HostDeviceVector* predictions, size_t batch_offset, bool is_dense) const { - batch.offset.SetDevice(ctx_->gpu_id); - batch.data.SetDevice(ctx_->gpu_id); + batch.offset.SetDevice(ctx_->Device()); + batch.data.SetDevice(ctx_->Device()); const uint32_t BLOCK_THREADS = 128; size_t num_rows = batch.Size(); auto GRID_SIZE = static_cast(common::DivRoundUp(num_rows, BLOCK_THREADS)); - auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id); + auto max_shared_memory_bytes = ConfigureDevice(ctx_->Device()); size_t shared_memory_bytes = SharedMemoryBytes(num_features, max_shared_memory_bytes); bool use_shared = shared_memory_bytes != 0; @@ -905,12 +932,12 @@ class GPUPredictor : public xgboost::Predictor { if (tree_end - tree_begin == 0) { return; } - out_preds->SetDevice(ctx_->gpu_id); + out_preds->SetDevice(ctx_->Device()); auto const& info = dmat->Info(); DeviceModel d_model; - d_model.Init(model, tree_begin, tree_end, ctx_->gpu_id); + d_model.Init(model, tree_begin, tree_end, ctx_->Device()); - if (dmat->Info().IsColumnSplit()) { + if (info.IsColumnSplit()) { column_split_helper_.PredictBatch(dmat, out_preds, model, d_model); return; } @@ -925,10 +952,10 @@ class GPUPredictor : public xgboost::Predictor { } else { size_t batch_offset = 0; for (auto const& page : dmat->GetBatches(ctx_, BatchParam{})) { - dmat->Info().feature_types.SetDevice(ctx_->gpu_id); + dmat->Info().feature_types.SetDevice(ctx_->Device()); auto feature_types = dmat->Info().feature_types.ConstDeviceSpan(); this->PredictInternal( - page.Impl()->GetDeviceAccessor(ctx_->gpu_id, feature_types), + page.Impl()->GetDeviceAccessor(ctx_->Device(), feature_types), d_model, out_preds, batch_offset); @@ -942,16 +969,15 @@ class GPUPredictor : public xgboost::Predictor { : Predictor::Predictor{ctx}, column_split_helper_{ctx} {} ~GPUPredictor() override { - if (ctx_->gpu_id >= 0 && ctx_->gpu_id < common::AllVisibleGPUs()) { - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); + if (ctx_->IsCUDA() && ctx_->Ordinal() < common::AllVisibleGPUs()) { + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); } } void PredictBatch(DMatrix* dmat, PredictionCacheEntry* predts, const gbm::GBTreeModel& model, uint32_t tree_begin, uint32_t tree_end = 0) const override { - int device = ctx_->gpu_id; - CHECK_GE(device, 0) << "Set `gpu_id' to positive value for processing GPU data."; + CHECK(ctx_->Device().IsCUDA()) << "Set `device' to `cuda` for processing GPU data."; auto* out_preds = &predts->predictions; if (tree_end == 0) { tree_end = model.trees.size(); @@ -969,9 +995,9 @@ class GPUPredictor : public xgboost::Predictor { auto m = std::any_cast>(x); CHECK_EQ(m->NumColumns(), model.learner_model_param->num_feature) << "Number of columns in data must equal to trained model."; - CHECK_EQ(dh::CurrentDevice(), m->DeviceIdx()) - << "XGBoost is running on device: " << this->ctx_->gpu_id << ", " - << "but data is on: " << m->DeviceIdx(); + CHECK_EQ(dh::CurrentDevice(), m->Device().ordinal) + << "XGBoost is running on device: " << this->ctx_->Device().Name() << ", " + << "but data is on: " << m->Device().Name(); if (p_m) { p_m->Info().num_row_ = m->NumRows(); this->InitOutPredictions(p_m->Info(), 
&(out_preds->predictions), model); @@ -980,16 +1006,16 @@ class GPUPredictor : public xgboost::Predictor { info.num_row_ = m->NumRows(); this->InitOutPredictions(info, &(out_preds->predictions), model); } - out_preds->predictions.SetDevice(m->DeviceIdx()); + out_preds->predictions.SetDevice(m->Device()); const uint32_t BLOCK_THREADS = 128; auto GRID_SIZE = static_cast(common::DivRoundUp(m->NumRows(), BLOCK_THREADS)); - auto max_shared_memory_bytes = dh::MaxSharedMemory(m->DeviceIdx()); + auto max_shared_memory_bytes = dh::MaxSharedMemory(m->Device().ordinal); size_t shared_memory_bytes = SharedMemoryBytes(m->NumColumns(), max_shared_memory_bytes); DeviceModel d_model; - d_model.Init(model, tree_begin, tree_end, m->DeviceIdx()); + d_model.Init(model, tree_begin, tree_end, m->Device()); bool use_shared = shared_memory_bytes != 0; size_t entry_start = 0; @@ -1039,10 +1065,10 @@ class GPUPredictor : public xgboost::Predictor { if (tree_weights != nullptr) { LOG(FATAL) << "Dart booster feature " << not_implemented; } - - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); - - out_contribs->SetDevice(ctx_->gpu_id); + CHECK(!p_fmat->Info().IsColumnSplit()) + << "Predict contribution support for column-wise data split is not yet implemented."; + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); + out_contribs->SetDevice(ctx_->Device()); if (tree_end == 0 || tree_end > model.trees.size()) { tree_end = static_cast(model.trees.size()); } @@ -1060,12 +1086,12 @@ class GPUPredictor : public xgboost::Predictor { dh::device_vector> device_paths; DeviceModel d_model; - d_model.Init(model, 0, tree_end, ctx_->gpu_id); + d_model.Init(model, 0, tree_end, ctx_->Device()); dh::device_vector categories; - ExtractPaths(&device_paths, &d_model, &categories, ctx_->gpu_id); + ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device()); for (auto& batch : p_fmat->GetBatches()) { - batch.data.SetDevice(ctx_->gpu_id); - batch.offset.SetDevice(ctx_->gpu_id); + batch.data.SetDevice(ctx_->Device()); + batch.offset.SetDevice(ctx_->Device()); SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), model.learner_model_param->num_feature); auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns; @@ -1074,7 +1100,7 @@ class GPUPredictor : public xgboost::Predictor { dh::tend(phis)); } // Add the base margin term to last column - p_fmat->Info().base_margin_.SetDevice(ctx_->gpu_id); + p_fmat->Info().base_margin_.SetDevice(ctx_->Device()); const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan(); auto base_score = model.learner_model_param->BaseScore(ctx_); @@ -1099,10 +1125,8 @@ class GPUPredictor : public xgboost::Predictor { if (tree_weights != nullptr) { LOG(FATAL) << "Dart booster feature " << not_implemented; } - - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); - - out_contribs->SetDevice(ctx_->gpu_id); + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); + out_contribs->SetDevice(ctx_->Device()); if (tree_end == 0 || tree_end > model.trees.size()) { tree_end = static_cast(model.trees.size()); } @@ -1121,12 +1145,12 @@ class GPUPredictor : public xgboost::Predictor { dh::device_vector> device_paths; DeviceModel d_model; - d_model.Init(model, 0, tree_end, ctx_->gpu_id); + d_model.Init(model, 0, tree_end, ctx_->Device()); dh::device_vector categories; - ExtractPaths(&device_paths, &d_model, &categories, ctx_->gpu_id); + ExtractPaths(&device_paths, &d_model, &categories, ctx_->Device()); for (auto& batch : p_fmat->GetBatches()) { - batch.data.SetDevice(ctx_->gpu_id); - 
batch.offset.SetDevice(ctx_->gpu_id); + batch.data.SetDevice(ctx_->Device()); + batch.offset.SetDevice(ctx_->Device()); SparsePageView X(batch.data.DeviceSpan(), batch.offset.DeviceSpan(), model.learner_model_param->num_feature); auto begin = dh::tbegin(phis) + batch.base_rowid * contributions_columns; @@ -1135,7 +1159,7 @@ class GPUPredictor : public xgboost::Predictor { dh::tend(phis)); } // Add the base margin term to last column - p_fmat->Info().base_margin_.SetDevice(ctx_->gpu_id); + p_fmat->Info().base_margin_.SetDevice(ctx_->Device()); const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan(); auto base_score = model.learner_model_param->BaseScore(ctx_); @@ -1160,30 +1184,35 @@ class GPUPredictor : public xgboost::Predictor { void PredictLeaf(DMatrix *p_fmat, HostDeviceVector *predictions, const gbm::GBTreeModel &model, unsigned tree_end) const override { - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); - auto max_shared_memory_bytes = ConfigureDevice(ctx_->gpu_id); + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); + auto max_shared_memory_bytes = ConfigureDevice(ctx_->Device()); const MetaInfo& info = p_fmat->Info(); + bst_row_t num_rows = info.num_row_; + if (tree_end == 0 || tree_end > model.trees.size()) { + tree_end = static_cast(model.trees.size()); + } + predictions->SetDevice(ctx_->Device()); + predictions->Resize(num_rows * tree_end); + DeviceModel d_model; + d_model.Init(model, 0, tree_end, this->ctx_->Device()); + + if (info.IsColumnSplit()) { + column_split_helper_.PredictLeaf(p_fmat, predictions, model, d_model); + return; + } + constexpr uint32_t kBlockThreads = 128; size_t shared_memory_bytes = SharedMemoryBytes( info.num_col_, max_shared_memory_bytes); bool use_shared = shared_memory_bytes != 0; bst_feature_t num_features = info.num_col_; - bst_row_t num_rows = info.num_row_; size_t entry_start = 0; - if (tree_end == 0 || tree_end > model.trees.size()) { - tree_end = static_cast(model.trees.size()); - } - predictions->SetDevice(ctx_->gpu_id); - predictions->Resize(num_rows * tree_end); - DeviceModel d_model; - d_model.Init(model, 0, tree_end, this->ctx_->gpu_id); - if (p_fmat->PageExists()) { for (auto const& batch : p_fmat->GetBatches()) { - batch.data.SetDevice(ctx_->gpu_id); - batch.offset.SetDevice(ctx_->gpu_id); + batch.data.SetDevice(ctx_->Device()); + batch.offset.SetDevice(ctx_->Device()); bst_row_t batch_offset = 0; SparsePageView data{batch.data.DeviceSpan(), batch.offset.DeviceSpan(), model.learner_model_param->num_feature}; @@ -1208,7 +1237,7 @@ class GPUPredictor : public xgboost::Predictor { } else { for (auto const& batch : p_fmat->GetBatches(ctx_, BatchParam{})) { bst_row_t batch_offset = 0; - EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->gpu_id)}; + EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->Device())}; size_t num_rows = batch.Size(); auto grid = static_cast(common::DivRoundUp(num_rows, kBlockThreads)); @@ -1236,9 +1265,9 @@ class GPUPredictor : public xgboost::Predictor { private: /*! \brief Reconfigure the device when GPU is changed. 
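 *         Returns the maximum shared memory available per block in bytes, or
 *         0 when running on the CPU.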
*/ - static size_t ConfigureDevice(int device) { - if (device >= 0) { - return dh::MaxSharedMemory(device); + static size_t ConfigureDevice(DeviceOrd device) { + if (device.IsCUDA()) { + return dh::MaxSharedMemory(device.ordinal); } return 0; } diff --git a/src/predictor/predictor.cc b/src/predictor/predictor.cc index 08b97de47..5c24a037b 100644 --- a/src/predictor/predictor.cc +++ b/src/predictor/predictor.cc @@ -49,8 +49,8 @@ void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVectorOutputLength() * info.num_row_}; const HostDeviceVector* base_margin = info.base_margin_.Data(); - if (ctx_->gpu_id >= 0) { - out_preds->SetDevice(ctx_->gpu_id); + if (ctx_->Device().IsCUDA()) { + out_preds->SetDevice(ctx_->Device()); } if (!base_margin->Empty()) { out_preds->Resize(n); diff --git a/src/tree/fit_stump.cc b/src/tree/fit_stump.cc index 684299e88..6bafa77d0 100644 --- a/src/tree/fit_stump.cc +++ b/src/tree/fit_stump.cc @@ -19,8 +19,7 @@ #include "xgboost/linalg.h" // TensorView, Tensor, Constant #include "xgboost/logging.h" // CHECK_EQ -namespace xgboost { -namespace tree { +namespace xgboost::tree { namespace cpu_impl { void FitStump(Context const* ctx, MetaInfo const& info, linalg::TensorView gpair, @@ -68,13 +67,12 @@ inline void FitStump(Context const*, MetaInfo const&, linalg::TensorView const& gpair, bst_target_t n_targets, linalg::Vector* out) { - out->SetDevice(ctx->gpu_id); + out->SetDevice(ctx->Device()); out->Reshape(n_targets); gpair.SetDevice(ctx->Device()); auto gpair_t = gpair.View(ctx->Device()); - ctx->IsCPU() ? cpu_impl::FitStump(ctx, info, gpair_t, out->HostView()) - : cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device())); + ctx->IsCUDA() ? cuda_impl::FitStump(ctx, info, gpair_t, out->View(ctx->Device())) + : cpu_impl::FitStump(ctx, info, gpair_t, out->HostView()); } -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree diff --git a/src/tree/fit_stump.cu b/src/tree/fit_stump.cu index f87939cea..8bbb62a29 100644 --- a/src/tree/fit_stump.cu +++ b/src/tree/fit_stump.cu @@ -21,9 +21,7 @@ #include "xgboost/logging.h" // CHECK_EQ #include "xgboost/span.h" // span -namespace xgboost { -namespace tree { -namespace cuda_impl { +namespace xgboost::tree::cuda_impl { void FitStump(Context const* ctx, MetaInfo const& info, linalg::TensorView gpair, linalg::VectorView out) { auto n_targets = out.Size(); @@ -56,7 +54,7 @@ void FitStump(Context const* ctx, MetaInfo const& info, thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it, thrust::make_discard_iterator(), dh::tbegin(d_sum.Values())); - collective::GlobalSum(info, ctx->gpu_id, reinterpret_cast(d_sum.Values().data()), + collective::GlobalSum(info, ctx->Device(), reinterpret_cast(d_sum.Values().data()), d_sum.Size() * 2); thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), n_targets, @@ -65,6 +63,4 @@ void FitStump(Context const* ctx, MetaInfo const& info, CalcUnregularizedWeight(d_sum(i).GetGrad(), d_sum(i).GetHess())); }); } -} // namespace cuda_impl -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree::cuda_impl diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index ad5992602..542a7b6a5 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -451,7 +451,7 @@ void GPUHistEvaluator::EvaluateSplits( auto const world_size = collective::GetWorldSize(); dh::TemporaryArray all_candidate_storage(out_splits.size() * world_size); auto all_candidates = 
dh::ToSpan(all_candidate_storage); - collective::AllGather(device_, out_splits.data(), all_candidates.data(), + collective::AllGather(device_.ordinal, out_splits.data(), all_candidates.data(), out_splits.size() * sizeof(DeviceSplitCandidate)); // Reduce to get the best candidate from all workers. diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 667982aa9..7c61099a1 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -85,7 +85,7 @@ class GPUHistEvaluator { std::size_t node_categorical_storage_size_ = 0; // Is the data split column-wise? bool is_column_split_ = false; - int32_t device_; + DeviceOrd device_; // Copy the categories from device to host asynchronously. void CopyToHost( const std::vector& nidx); @@ -133,14 +133,14 @@ class GPUHistEvaluator { } public: - GPUHistEvaluator(TrainParam const ¶m, bst_feature_t n_features, int32_t device) + GPUHistEvaluator(TrainParam const ¶m, bst_feature_t n_features, DeviceOrd device) : tree_evaluator_{param, n_features, device}, param_{param} {} /** * \brief Reset the evaluator, should be called before any use. */ void Reset(common::HistogramCuts const &cuts, common::Span ft, bst_feature_t n_features, TrainParam const ¶m, bool is_column_split, - int32_t device); + DeviceOrd device); /** * \brief Get host category storage for nidx. Different from the internal version, this diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu index b23cb670b..e4ca29c97 100644 --- a/src/tree/gpu_hist/evaluator.cu +++ b/src/tree/gpu_hist/evaluator.cu @@ -1,5 +1,5 @@ /*! - * Copyright 2022 by XGBoost Contributors + * Copyright 2022-2023 by XGBoost Contributors * * \brief Some components of GPU Hist evaluator, this file only exist to reduce nvcc * compilation time. @@ -12,11 +12,10 @@ #include "evaluate_splits.cuh" #include "xgboost/data.h" -namespace xgboost { -namespace tree { +namespace xgboost::tree { void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span ft, bst_feature_t n_features, TrainParam const ¶m, - bool is_column_split, int32_t device) { + bool is_column_split, DeviceOrd device) { param_ = param; tree_evaluator_ = TreeEvaluator{param, n_features, device}; has_categoricals_ = cuts.HasCategorical(); @@ -201,6 +200,4 @@ common::Span GPUHistEvaluator::SortHistogram( #endif return dh::ToSpan(cat_sorted_idx_); } - -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree diff --git a/src/tree/gpu_hist/feature_groups.cuh b/src/tree/gpu_hist/feature_groups.cuh index 3af230c2c..671272822 100644 --- a/src/tree/gpu_hist/feature_groups.cuh +++ b/src/tree/gpu_hist/feature_groups.cuh @@ -1,5 +1,5 @@ -/*! 
- * Copyright 2020 by XGBoost Contributors +/** + * Copyright 2020-2023 by XGBoost Contributors */ #ifndef FEATURE_GROUPS_CUH_ #define FEATURE_GROUPS_CUH_ @@ -102,11 +102,10 @@ struct FeatureGroups { InitSingle(cuts); } - FeatureGroupsAccessor DeviceAccessor(int device) const { + [[nodiscard]] FeatureGroupsAccessor DeviceAccessor(DeviceOrd device) const { feature_segments.SetDevice(device); bin_segments.SetDevice(device); - return {feature_segments.ConstDeviceSpan(), bin_segments.ConstDeviceSpan(), - max_group_bins}; + return {feature_segments.ConstDeviceSpan(), bin_segments.ConstDeviceSpan(), max_group_bins}; } private: diff --git a/src/tree/gpu_hist/gradient_based_sampler.cu b/src/tree/gpu_hist/gradient_based_sampler.cu index 1082f8955..58add0a93 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cu +++ b/src/tree/gpu_hist/gradient_based_sampler.cu @@ -167,10 +167,10 @@ GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx, for (auto& batch : dmat->GetBatches(ctx, batch_param_)) { auto page = batch.Impl(); if (!page_) { - page_ = std::make_unique(ctx->gpu_id, page->Cuts(), page->is_dense, + page_ = std::make_unique(ctx->Device(), page->Cuts(), page->is_dense, page->row_stride, dmat->Info().num_row_); } - size_t num_elements = page_->Copy(ctx->gpu_id, page, offset); + size_t num_elements = page_->Copy(ctx->Device(), page, offset); offset += num_elements; } page_concatenated_ = true; @@ -228,13 +228,13 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx, auto first_page = (*batch_iterator.begin()).Impl(); // Create a new ELLPACK page with empty rows. page_.reset(); // Release the device memory first before reallocating - page_.reset(new EllpackPageImpl(ctx->gpu_id, first_page->Cuts(), first_page->is_dense, + page_.reset(new EllpackPageImpl(ctx->Device(), first_page->Cuts(), first_page->is_dense, first_page->row_stride, sample_rows)); // Compact the ELLPACK pages into the single sample page. thrust::fill(cuctx->CTP(), dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0); for (auto& batch : batch_iterator) { - page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_)); + page_->Compact(ctx->Device(), batch.Impl(), dh::ToSpan(sample_row_index_)); } return {sample_rows, page_.get(), dh::ToSpan(gpair_)}; @@ -306,13 +306,13 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c auto first_page = (*batch_iterator.begin()).Impl(); // Create a new ELLPACK page with empty rows. page_.reset(); // Release the device memory first before reallocating - page_.reset(new EllpackPageImpl(ctx->gpu_id, first_page->Cuts(), first_page->is_dense, + page_.reset(new EllpackPageImpl(ctx->Device(), first_page->Cuts(), first_page->is_dense, first_page->row_stride, sample_rows)); // Compact the ELLPACK pages into the single sample page. 
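  // (The zero-fill below resets the sample page so that rows not selected by
  //  the sampler stay empty once the batches are compacted into it.)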
thrust::fill(dh::tbegin(page_->gidx_buffer), dh::tend(page_->gidx_buffer), 0); for (auto& batch : batch_iterator) { - page_->Compact(ctx->gpu_id, batch.Impl(), dh::ToSpan(sample_row_index_)); + page_->Compact(ctx->Device(), batch.Impl(), dh::ToSpan(sample_row_index_)); } return {sample_rows, page_.get(), dh::ToSpan(gpair_)}; diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index b1ded6cda..35b43d24b 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -13,17 +13,15 @@ namespace xgboost { namespace tree { -RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) +RowPartitioner::RowPartitioner(DeviceOrd device_idx, size_t num_rows) : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) { - - dh::safe_cuda(cudaSetDevice(device_idx_)); - + dh::safe_cuda(cudaSetDevice(device_idx_.ordinal)); ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)}); thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); } RowPartitioner::~RowPartitioner() { - dh::safe_cuda(cudaSetDevice(device_idx_)); + dh::safe_cuda(cudaSetDevice(device_idx_.ordinal)); } common::Span RowPartitioner::GetRows(bst_node_t nidx) { diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 74f0dee2b..7c43d2fd4 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -210,7 +210,7 @@ class RowPartitioner { static constexpr bst_node_t kIgnoredTreePosition = -1; private: - int device_idx_; + DeviceOrd device_idx_; /*! \brief In here if you want to find the rows belong to a node nid, first you need to * get the indices segment from ridx_segments[nid], then get the row index that * represents position of row in input data X. `RowPartitioner::GetRows` would be a @@ -234,7 +234,7 @@ class RowPartitioner { dh::PinnedMemory pinned2_; public: - RowPartitioner(int device_idx, size_t num_rows); + RowPartitioner(DeviceOrd device_idx, size_t num_rows); ~RowPartitioner(); RowPartitioner(const RowPartitioner&) = delete; RowPartitioner& operator=(const RowPartitioner&) = delete; diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index d0267b0ed..680c50398 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -292,20 +292,19 @@ class HistEvaluator { */ std::vector Allgather(std::vector const &entries) { auto const world = collective::GetWorldSize(); - auto const rank = collective::GetRank(); auto const num_entries = entries.size(); // First, gather all the primitive fields. - std::vector all_entries(num_entries * world); + std::vector local_entries(num_entries); std::vector cat_bits; std::vector cat_bits_sizes; for (std::size_t i = 0; i < num_entries; i++) { - all_entries[num_entries * rank + i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes); + local_entries[i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes); } - collective::Allgather(all_entries.data(), all_entries.size() * sizeof(CPUExpandEntry)); + auto all_entries = collective::Allgather(local_entries); // Gather all the cat_bits. - auto gathered = collective::AllgatherV(cat_bits, cat_bits_sizes); + auto gathered = collective::SpecialAllgatherV(cat_bits, cat_bits_sizes); common::ParallelFor(num_entries * world, ctx_->Threads(), [&] (auto i) { // Copy the cat_bits back into all expand entries. 
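// The Allgather rewrite in the hunk above swaps a fill-then-gather idiom for
// a value-returning gather: callers no longer pre-size a world * n buffer and
// write their own slice at rank * n before a byte-wise in-place gather. A
// single-process sketch of the new shape, with stubbed collectives
// (`mock::GetWorldSize`, `mock::GetRank`, `mock::Allgather` are assumed
// stand-ins, not the real xgboost::collective API):

#include <algorithm>
#include <cstddef>
#include <vector>

namespace mock {
inline int GetWorldSize() { return 1; }  // single-process stand-in
inline int GetRank() { return 0; }

// Takes the local entries and returns the concatenation across all workers;
// the rank-offset arithmetic lives here instead of in every caller.
template <typename T>
std::vector<T> Allgather(std::vector<T> const& local) {
  std::vector<T> all(local.size() * GetWorldSize());
  std::copy(local.begin(), local.end(),
            all.begin() + static_cast<std::ptrdiff_t>(GetRank() * local.size()));
  return all;
}
}  // namespace mock

// With this shape, a caller such as HistEvaluator::Allgather only has to
// collect its entries into `local_entries` and use whatever comes back, which
// is why the rank bookkeeping disappears from the hunk above and from the
// matching HistMultiEvaluator change below.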
@@ -477,7 +476,7 @@ class HistEvaluator { : ctx_{ctx}, param_{param}, column_sampler_{std::move(sampler)}, - tree_evaluator_{*param, static_cast(info.num_col_), Context::kCpuId}, + tree_evaluator_{*param, static_cast(info.num_col_), DeviceOrd::CPU()}, is_col_split_{info.IsColumnSplit()} { interaction_constraints_.Configure(*param, info.num_col_); column_sampler_->Init(ctx, info.num_col_, info.feature_weights.HostVector(), @@ -579,28 +578,24 @@ class HistMultiEvaluator { */ std::vector Allgather(std::vector const &entries) { auto const world = collective::GetWorldSize(); - auto const rank = collective::GetRank(); auto const num_entries = entries.size(); // First, gather all the primitive fields. - std::vector all_entries(num_entries * world); + std::vector local_entries(num_entries); std::vector cat_bits; std::vector cat_bits_sizes; std::vector gradients; for (std::size_t i = 0; i < num_entries; i++) { - all_entries[num_entries * rank + i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes, - &gradients); + local_entries[i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes, &gradients); } - collective::Allgather(all_entries.data(), all_entries.size() * sizeof(MultiExpandEntry)); + auto all_entries = collective::Allgather(local_entries); // Gather all the cat_bits. - auto gathered_cat_bits = collective::AllgatherV(cat_bits, cat_bits_sizes); + auto gathered_cat_bits = collective::SpecialAllgatherV(cat_bits, cat_bits_sizes); // Gather all the gradients. auto const num_gradients = gradients.size(); - std::vector all_gradients(num_gradients * world); - std::copy_n(gradients.cbegin(), num_gradients, all_gradients.begin() + num_gradients * rank); - collective::Allgather(all_gradients.data(), all_gradients.size() * sizeof(GradientPairPrecise)); + auto const all_gradients = collective::Allgather(gradients); auto const total_entries = num_entries * world; auto const gradients_per_entry = num_gradients / num_entries; @@ -696,7 +691,7 @@ class HistMultiEvaluator { stats_ = linalg::Constant(ctx_, GradientPairPrecise{}, 1, n_targets); gain_.resize(1); - linalg::Vector weight({n_targets}, ctx_->gpu_id); + linalg::Vector weight({n_targets}, ctx_->Device()); CalcWeight(*param_, root_sum, weight.HostView()); auto root_gain = CalcGainGivenWeight(*param_, root_sum, weight.HostView()); gain_.front() = root_gain; diff --git a/src/tree/split_evaluator.h b/src/tree/split_evaluator.h index c54197cb9..13085d1a0 100644 --- a/src/tree/split_evaluator.h +++ b/src/tree/split_evaluator.h @@ -1,5 +1,5 @@ -/*! - * Copyright 2018-2020 by Contributors +/** + * Copyright 2018-2023 by Contributors * \file split_evaluator.h * \brief Used for implementing a loss term specific to decision trees. Useful for custom regularisation. * \author Henry Gouk @@ -23,8 +23,7 @@ #include "xgboost/host_device_vector.h" #include "xgboost/tree_model.h" -namespace xgboost { -namespace tree { +namespace xgboost::tree { class TreeEvaluator { // hist and exact use parent id to calculate constraints. 
static constexpr bst_node_t kRootParentId = @@ -33,13 +32,13 @@ class TreeEvaluator { HostDeviceVector lower_bounds_; HostDeviceVector upper_bounds_; HostDeviceVector monotone_; - int32_t device_; + DeviceOrd device_; bool has_constraint_; public: - TreeEvaluator(TrainParam const& p, bst_feature_t n_features, int32_t device) { + TreeEvaluator(TrainParam const& p, bst_feature_t n_features, DeviceOrd device) { device_ = device; - if (device != Context::kCpuId) { + if (device.IsCUDA()) { lower_bounds_.SetDevice(device); upper_bounds_.SetDevice(device); monotone_.SetDevice(device); @@ -59,7 +58,7 @@ class TreeEvaluator { has_constraint_ = true; } - if (device_ != Context::kCpuId) { + if (device_.IsCUDA()) { // Pull to device early. lower_bounds_.ConstDeviceSpan(); upper_bounds_.ConstDeviceSpan(); @@ -122,8 +121,8 @@ class TreeEvaluator { } // Fast floating point division instruction on device - XGBOOST_DEVICE float Divide(float a, float b) const { -#if defined(__CUDA_ARCH__) + [[nodiscard]] XGBOOST_DEVICE float Divide(float a, float b) const { +#ifdef __CUDA_ARCH__ return __fdividef(a, b); #elif defined(__HIP_PLATFORM_AMD__) return a / b; @@ -156,7 +155,7 @@ class TreeEvaluator { public: /* Get a view to the evaluator that can be passed down to device. */ template auto GetEvaluator() const { - if (device_ != Context::kCpuId) { + if (device_.IsCUDA()) { auto constraints = monotone_.ConstDevicePointer(); return SplitEvaluator{constraints, lower_bounds_.ConstDevicePointer(), upper_bounds_.ConstDevicePointer(), has_constraint_}; @@ -217,7 +216,6 @@ enum SplitType { // partition-based categorical split kPart = 2 }; -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree #endif // XGBOOST_TREE_SPLIT_EVALUATOR_H_ diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc index 3afbe3e46..7a88bd30e 100644 --- a/src/tree/updater_colmaker.cc +++ b/src/tree/updater_colmaker.cc @@ -154,7 +154,7 @@ class ColMaker: public TreeUpdater { : param_(param), colmaker_train_param_{colmaker_train_param}, ctx_{ctx}, - tree_evaluator_(param_, column_densities.size(), Context::kCpuId), + tree_evaluator_(param_, column_densities.size(), DeviceOrd::CPU()), interaction_constraints_{std::move(_interaction_constraints)}, column_densities_(column_densities) {} // update one tree, growing diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 58074a79e..faf110394 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -75,7 +75,7 @@ class DeviceHistogramStorage { dh::device_vector overflow_; std::map overflow_nidx_map_; int n_bins_; - int device_id_; + DeviceOrd device_id_; static constexpr size_t kNumItemsInGradientSum = sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT); static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2."); @@ -83,7 +83,7 @@ class DeviceHistogramStorage { public: // Start with about 16mb DeviceHistogramStorage() { data_.reserve(1 << 22); } - void Init(int device_id, int n_bins) { + void Init(DeviceOrd device_id, int n_bins) { this->n_bins_ = n_bins; this->device_id_ = device_id; } @@ -197,7 +197,7 @@ struct GPUHistMakerDevice { common::Span _feature_types, bst_row_t _n_rows, TrainParam _param, std::shared_ptr column_sampler, uint32_t n_features, BatchParam batch_param, MetaInfo const& info) - : evaluator_{_param, n_features, ctx->gpu_id}, + : evaluator_{_param, n_features, ctx->Device()}, ctx_(ctx), feature_types{_feature_types}, param(std::move(_param)), @@ -212,7 +212,7 @@ 
struct GPUHistMakerDevice { } CHECK(column_sampler_); - monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id)); + monitor.Init(std::string("GPUHistMakerDevice") + ctx_->Device().Name()); } ~GPUHistMakerDevice() = default; @@ -221,7 +221,7 @@ struct GPUHistMakerDevice { if (!feature_groups) { CHECK(page); feature_groups = std::make_unique(page->Cuts(), page->is_dense, - dh::MaxSharedMemoryOptin(ctx_->gpu_id), + dh::MaxSharedMemoryOptin(ctx_->Ordinal()), sizeof(GradientPairPrecise)); } } @@ -232,7 +232,7 @@ struct GPUHistMakerDevice { this->column_sampler_->Init(ctx_, num_columns, info.feature_weights.HostVector(), param.colsample_bynode, param.colsample_bylevel, param.colsample_bytree); - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); this->interaction_constraints.Reset(); @@ -247,15 +247,15 @@ struct GPUHistMakerDevice { gpair = sample.gpair; this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, - dmat->Info().IsColumnSplit(), ctx_->gpu_id); + dmat->Info().IsColumnSplit(), ctx_->Device()); quantiser = std::make_unique(this->gpair, dmat->Info()); row_partitioner.reset(); // Release the device memory first before reallocating - row_partitioner = std::make_unique(ctx_->gpu_id, sample.sample_rows); + row_partitioner = std::make_unique(ctx_->Device(), sample.sample_rows); // Init histogram - hist.Init(ctx_->gpu_id, page->Cuts().TotalBins()); + hist.Init(ctx_->Device(), page->Cuts().TotalBins()); hist.Reset(); this->InitFeatureGroupsOnce(); @@ -268,7 +268,7 @@ struct GPUHistMakerDevice { sampled_features->SetDevice(ctx_->Device()); common::Span feature_set = interaction_constraints.Query(sampled_features->DeviceSpan(), nidx); - auto matrix = page->GetDeviceAccessor(ctx_->gpu_id); + auto matrix = page->GetDeviceAccessor(ctx_->Device()); EvaluateSplitInputs inputs{nidx, 0, root_sum, feature_set, hist.GetNodeHistogram(nidx)}; EvaluateSplitSharedInputs shared_inputs{ gpu_param, @@ -290,7 +290,7 @@ struct GPUHistMakerDevice { dh::TemporaryArray splits_out(2 * candidates.size()); std::vector nidx(2 * candidates.size()); auto h_node_inputs = pinned2.GetSpan(2 * candidates.size()); - auto matrix = page->GetDeviceAccessor(ctx_->gpu_id); + auto matrix = page->GetDeviceAccessor(ctx_->Device()); EvaluateSplitSharedInputs shared_inputs{GPUTrainingParam{param}, *quantiser, feature_types, matrix.feature_segments, matrix.gidx_fvalue_map, matrix.min_fvalue, @@ -343,9 +343,9 @@ struct GPUHistMakerDevice { void BuildHist(int nidx) { auto d_node_hist = hist.GetNodeHistogram(nidx); auto d_ridx = row_partitioner->GetRows(nidx); - BuildGradientHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->gpu_id), - feature_groups->DeviceAccessor(ctx_->gpu_id), gpair, d_ridx, d_node_hist, - *quantiser); + BuildGradientHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->Device()), + feature_groups->DeviceAccessor(ctx_->Device()), gpair, d_ridx, + d_node_hist, *quantiser); } // Attempt to do subtraction trick @@ -414,10 +414,10 @@ struct GPUHistMakerDevice { }); collective::AllReduce( - ctx_->gpu_id, decision_storage.data().get(), decision_storage.size()); + ctx_->Ordinal(), decision_storage.data().get(), decision_storage.size()); collective::AllReduce( - ctx_->gpu_id, missing_storage.data().get(), missing_storage.size()); - collective::Synchronize(ctx_->gpu_id); + ctx_->Ordinal(), missing_storage.data().get(), missing_storage.size()); + collective::Synchronize(ctx_->Ordinal()); row_partitioner->UpdatePositionBatch( nidx, 
left_nidx, right_nidx, split_data, @@ -455,7 +455,7 @@ struct GPUHistMakerDevice { CHECK_EQ(split_type == FeatureType::kCategorical, e.split.is_cat); } - auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); + auto d_matrix = page->GetDeviceAccessor(ctx_->Device()); if (info_.IsColumnSplit()) { UpdatePositionColumnSplit(d_matrix, split_data, nidx, left_nidx, right_nidx); @@ -527,9 +527,9 @@ struct GPUHistMakerDevice { common::Span d_feature_types, common::Span categories, common::Span categories_segments, HostDeviceVector* p_out_position) { - auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); + auto d_matrix = page->GetDeviceAccessor(ctx_->Device()); auto d_gpair = this->gpair; - p_out_position->SetDevice(ctx_->gpu_id); + p_out_position->SetDevice(ctx_->Device()); p_out_position->Resize(row_partitioner->GetRows().size()); auto new_position_op = [=] __device__(size_t row_id, int position) { @@ -619,7 +619,7 @@ struct GPUHistMakerDevice { monitor.Start("AllReduce"); auto d_node_hist = hist.GetNodeHistogram(nidx).data(); using ReduceT = typename std::remove_pointer::type::ValueT; - collective::GlobalSum(info_, ctx_->gpu_id, reinterpret_cast(d_node_hist), + collective::GlobalSum(info_, ctx_->Device(), reinterpret_cast(d_node_hist), page->Cuts().TotalBins() * 2 * num_histograms); monitor.Stop("AllReduce"); @@ -862,7 +862,7 @@ class GPUHistMaker : public TreeUpdater { } void InitDataOnce(TrainParam const* param, DMatrix* dmat) { - CHECK_GE(ctx_->gpu_id, 0) << "Must have at least one device"; + CHECK_GE(ctx_->Ordinal(), 0) << "Must have at least one device"; info_ = &dmat->Info(); // Synchronise the column sampling seed @@ -871,9 +871,8 @@ class GPUHistMaker : public TreeUpdater { this->column_sampler_ = std::make_shared(column_sampling_seed); auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()}; - dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); - - info_->feature_types.SetDevice(ctx_->gpu_id); + dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); + info_->feature_types.SetDevice(ctx_->Device()); maker = std::make_unique( ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_, *param, column_sampler_, info_->num_col_, batch_param, dmat->Info()); @@ -898,7 +897,7 @@ class GPUHistMaker : public TreeUpdater { this->InitData(param, p_fmat, p_tree); monitor_.Stop("InitData"); - gpair->SetDevice(ctx_->gpu_id); + gpair->SetDevice(ctx_->Device()); maker->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position); } @@ -1031,7 +1030,7 @@ class GPUGlobalApproxMaker : public TreeUpdater { this->InitData(p_fmat, p_tree); monitor_.Stop("InitData"); - gpair->SetDevice(ctx_->gpu_id); + gpair->SetDevice(ctx_->Device()); maker_->UpdateTree(gpair, p_fmat, task_, p_tree, p_out_position); } diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 34890c2e5..50943e1c4 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -518,7 +518,7 @@ class QuantileHistMaker : public TreeUpdater { auto need_copy = [&] { return trees.size() > 1 || n_targets > 1; }; if (need_copy()) { // allocate buffer - sample_out = decltype(sample_out){h_gpair.Shape(), ctx_->gpu_id, linalg::Order::kF}; + sample_out = decltype(sample_out){h_gpair.Shape(), ctx_->Device(), linalg::Order::kF}; h_sample_out = sample_out.HostView(); } diff --git a/tests/buildkite/conftest.sh b/tests/buildkite/conftest.sh index 9e821f0fe..3d820d727 100755 --- a/tests/buildkite/conftest.sh +++ b/tests/buildkite/conftest.sh @@ -24,7 +24,7 @@ set -x 
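The updater hunks above separate two views of the same device: typed code now passes `ctx_->Device()` (a `DeviceOrd`) through XGBoost structures, while raw CUDA runtime calls such as `cudaSetDevice` keep taking the integer from `ctx_->Ordinal()`. Here is a minimal re-implementation of the idea, not XGBoost's actual header (in particular, encoding CPU as a negative ordinal is an assumption of the sketch), just to make the split concrete.

#include <cstdint>
#include <iostream>
#include <string>

struct DeviceOrd {
  std::int32_t ordinal{-1};  // toy encoding: negative means CPU
  static DeviceOrd CPU() { return {-1}; }
  static DeviceOrd CUDA(std::int32_t i) { return {i}; }
  bool IsCUDA() const { return ordinal >= 0; }
  std::string Name() const {
    return IsCUDA() ? "cuda:" + std::to_string(ordinal) : "cpu";
  }
};

int main() {
  auto d = DeviceOrd::CUDA(0);
  std::cout << d.Name() << ' ' << d.IsCUDA() << '\n';  // cuda:0 1
  // Runtime APIs still want the raw integer, cf. cudaSetDevice(ctx_->Ordinal()):
  // cudaSetDevice(d.ordinal);
  return 0;
}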
CUDA_VERSION=11.8.0 NCCL_VERSION=2.16.5-1 -RAPIDS_VERSION=23.08 +RAPIDS_VERSION=23.10 SPARK_VERSION=3.4.0 JDK_VERSION=8 diff --git a/tests/ci_build/Dockerfile.gpu_build_centos7 b/tests/ci_build/Dockerfile.gpu_build_centos7 index 6134d49aa..98a0a7033 100644 --- a/tests/ci_build/Dockerfile.gpu_build_centos7 +++ b/tests/ci_build/Dockerfile.gpu_build_centos7 @@ -1,5 +1,5 @@ ARG CUDA_VERSION_ARG -FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7 +FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7 ARG CUDA_VERSION_ARG ARG NCCL_VERSION_ARG ARG RAPIDS_VERSION_ARG diff --git a/tests/ci_build/Dockerfile.gpu_build_r_centos7 b/tests/ci_build/Dockerfile.gpu_build_r_centos7 index 6cfd30fe5..b73cf5adb 100644 --- a/tests/ci_build/Dockerfile.gpu_build_r_centos7 +++ b/tests/ci_build/Dockerfile.gpu_build_r_centos7 @@ -1,5 +1,5 @@ ARG CUDA_VERSION_ARG -FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7 +FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7 ARG CUDA_VERSION_ARG # Install all basic requirements diff --git a/tests/ci_build/Dockerfile.jvm_gpu_build b/tests/ci_build/Dockerfile.jvm_gpu_build index d4a580495..86ce7e72a 100644 --- a/tests/ci_build/Dockerfile.jvm_gpu_build +++ b/tests/ci_build/Dockerfile.jvm_gpu_build @@ -1,5 +1,5 @@ ARG CUDA_VERSION_ARG -FROM nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7 +FROM nvcr.io/nvidia/cuda:$CUDA_VERSION_ARG-devel-centos7 ARG CUDA_VERSION_ARG ARG NCCL_VERSION_ARG diff --git a/tests/ci_build/ci_build.sh b/tests/ci_build/ci_build.sh index a8567a89a..ef0c69183 100755 --- a/tests/ci_build/ci_build.sh +++ b/tests/ci_build/ci_build.sh @@ -148,10 +148,11 @@ then $(aws ecr get-login --no-include-email --region ${DOCKER_CACHE_ECR_REGION} --registry-ids ${DOCKER_CACHE_ECR_ID}) # Pull pre-build container from Docker build cache, # if one exists for the particular branch or pull request - echo "docker pull --quiet ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}" - if time docker pull --quiet "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}" + DOCKER_TAG="${BRANCH_NAME//\//-}" # Slashes are not allow in Docker tag + echo "docker pull --quiet ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}" + if time docker pull --quiet "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}" then - CACHE_FROM_CMD="--cache-from ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}" + CACHE_FROM_CMD="--cache-from ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}" else # If the build cache is empty of the particular branch or pull request, # use the build cache associated with the master branch @@ -185,8 +186,8 @@ if [[ -n "${DOCKER_CACHE_REPO}" ]] then # Push the container we just built to the Docker build cache # that is associated with the particular branch or pull request - echo "docker tag ${DOCKER_IMG_NAME} ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}" - docker tag "${DOCKER_IMG_NAME}" "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}" + echo "docker tag ${DOCKER_IMG_NAME} ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}" + docker tag "${DOCKER_IMG_NAME}" "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}" # Attempt to create Docker repository; it will fail if the repository already exists echo "aws ecr create-repository --repository-name ${DOCKER_IMG_NAME} --region ${DOCKER_CACHE_ECR_REGION}" @@ -214,10 +215,10 @@ then EOF fi - echo "docker push --quiet ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}" - docker push --quiet "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}" + echo "docker push --quiet 
${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}" + docker push --quiet "${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}" if [[ $? != "0" ]]; then - echo "ERROR: could not update Docker cache ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${BRANCH_NAME}" + echo "ERROR: could not update Docker cache ${DOCKER_CACHE_REPO}/${DOCKER_IMG_NAME}:${DOCKER_TAG}" exit 1 fi fi diff --git a/tests/ci_build/lint_cmake.sh b/tests/ci_build/lint_cmake.sh new file mode 100644 index 000000000..d67ecd084 --- /dev/null +++ b/tests/ci_build/lint_cmake.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +set -e + +cmake_files=$( + find . -name CMakeLists.txt -o -path "./cmake/*.cmake" \ + | grep -v dmlc-core \ + | grep -v gputreeshap +) +cmakelint \ + --linelength=120 \ + --filter=-convention/filename,-package/stdargs,-readability/wonkycase \ + ${cmake_files} \ +|| exit 1 diff --git a/tests/ci_build/lint_cpp.py b/tests/ci_build/lint_cpp.py new file mode 100644 index 000000000..593b8f870 --- /dev/null +++ b/tests/ci_build/lint_cpp.py @@ -0,0 +1,166 @@ +import argparse +import os +import re +import sys + +import cpplint +from cpplint import _cpplint_state + +CXX_SUFFIX = set(["cc", "c", "cpp", "h", "cu", "hpp"]) + + +def filepath_enumerate(paths): + """Enumerate the file paths of all subfiles of the list of paths""" + out = [] + for path in paths: + if os.path.isfile(path): + out.append(path) + else: + for root, dirs, files in os.walk(path): + for name in files: + out.append(os.path.normpath(os.path.join(root, name))) + return out + + +def get_header_guard_dmlc(filename): + """Get Header Guard Convention for DMLC Projects. + + For headers in include, directly use the path + For headers in src, use project name plus path + + Examples: with project-name = dmlc + include/dmlc/timer.h -> DMLC_TIMTER_H_ + src/io/libsvm_parser.h -> DMLC_IO_LIBSVM_PARSER_H_ + """ + fileinfo = cpplint.FileInfo(filename) + file_path_from_root = fileinfo.RepositoryName() + inc_list = ["include", "api", "wrapper", "contrib"] + if os.name == "nt": + inc_list.append("mshadow") + + if file_path_from_root.find("src/") != -1 and _HELPER.project_name is not None: + idx = file_path_from_root.find("src/") + file_path_from_root = _HELPER.project_name + file_path_from_root[idx + 3 :] + else: + idx = file_path_from_root.find("include/") + if idx != -1: + file_path_from_root = file_path_from_root[idx + 8 :] + for spath in inc_list: + prefix = spath + "/" + if file_path_from_root.startswith(prefix): + file_path_from_root = re.sub("^" + prefix, "", file_path_from_root) + break + return re.sub(r"[-./\s]", "_", file_path_from_root).upper() + "_" + + +class Lint: + def __init__(self): + self.project_name = "xgboost" + self.cpp_header_map = {} + self.cpp_src_map = {} + self.python_map = {} + + self.pylint_cats = set(["error", "warning", "convention", "refactor"]) + # setup cpp lint + cpplint_args = ["--quiet", "--extensions=" + (",".join(CXX_SUFFIX)), "."] + _ = cpplint.ParseArguments(cpplint_args) + cpplint._SetFilters( + ",".join( + [ + "-build/c++11", + "-build/include,", + "+build/namespaces", + "+build/include_what_you_use", + "+build/include_order", + ] + ) + ) + cpplint._SetCountingStyle("toplevel") + cpplint._line_length = 100 + + def process_cpp(self, path, suffix): + """Process a cpp file.""" + _cpplint_state.ResetErrorCounts() + cpplint.ProcessFile(str(path), _cpplint_state.verbose_level) + _cpplint_state.PrintErrorCounts() + errors = _cpplint_state.errors_by_category.copy() + + if suffix == "h": + self.cpp_header_map[str(path)] = errors + else: + 
self.cpp_src_map[str(path)] = errors + + @staticmethod + def _print_summary_map(strm, result_map, ftype): + """Print summary of certain result map.""" + if len(result_map) == 0: + return 0 + npass = sum(1 for x in result_map.values() if len(x) == 0) + strm.write(f"====={npass}/{len(result_map)} {ftype} files passed check=====\n") + for fname, emap in result_map.items(): + if len(emap) == 0: + continue + strm.write( + f"{fname}: {sum(emap.values())} Errors of {len(emap)} Categories map={str(emap)}\n" + ) + return len(result_map) - npass + + def print_summary(self, strm): + """Print summary of lint.""" + nerr = 0 + nerr += Lint._print_summary_map(strm, self.cpp_header_map, "cpp-header") + nerr += Lint._print_summary_map(strm, self.cpp_src_map, "cpp-source") + if nerr == 0: + strm.write("All passed!\n") + else: + strm.write(f"{nerr} files failed lint\n") + return nerr + + +_HELPER = Lint() + +cpplint.GetHeaderGuardCPPVariable = get_header_guard_dmlc + + +def process(fname, allow_type): + """Process a file.""" + fname = str(fname) + arr = fname.rsplit(".", 1) + if fname.find("#") != -1 or arr[-1] not in allow_type: + return + if arr[-1] in CXX_SUFFIX: + _HELPER.process_cpp(fname, arr[-1]) + + +def main(): + parser = argparse.ArgumentParser(description="run cpp lint") + parser.add_argument("path", nargs="+", help="path to traverse") + parser.add_argument( + "--exclude_path", + nargs="+", + default=[], + help="exclude this path, and all subfolders if path is a folder", + ) + args = parser.parse_args() + excluded_paths = filepath_enumerate(args.exclude_path) + + allow_type = [] + allow_type += CXX_SUFFIX + + for path in args.path: + if os.path.isfile(path): + normpath = os.path.normpath(path) + if normpath not in excluded_paths: + process(path, allow_type) + else: + for root, dirs, files in os.walk(path): + for name in files: + file_path = os.path.normpath(os.path.join(root, name)) + if file_path not in excluded_paths: + process(file_path, allow_type) + nerr = _HELPER.print_summary(sys.stderr) + sys.exit(nerr > 0) + + +if __name__ == "__main__": + main() diff --git a/tests/ci_build/lint_r.R b/tests/ci_build/lint_r.R index ce512482d..9b55ebfce 100644 --- a/tests/ci_build/lint_r.R +++ b/tests/ci_build/lint_r.R @@ -28,6 +28,7 @@ my_linters <- list( equals_na = lintr::equals_na_linter(), fixed_regex = lintr::fixed_regex_linter(), for_loop_index = lintr::for_loop_index_linter(), + function_left_parentheses = lintr::function_left_parentheses_linter(), function_return = lintr::function_return_linter(), infix_spaces_linter = lintr::infix_spaces_linter(), is_numeric = lintr::is_numeric_linter(), diff --git a/tests/ci_build/test_r_package.py b/tests/ci_build/test_r_package.py index f1e179d43..853bf0502 100644 --- a/tests/ci_build/test_r_package.py +++ b/tests/ci_build/test_r_package.py @@ -3,9 +3,15 @@ import argparse import os import shutil import subprocess +from io import StringIO from pathlib import Path from platform import system +try: + import pandas as pd +except ImportError: + pd = None + from test_utils import R_PACKAGE, ROOT, DirectoryExcursion, cd, print_time, record_time @@ -97,16 +103,47 @@ def build_rpackage(path: str) -> str: return tarball +def check_example_timing(rcheck_dir: Path, threshold: float) -> None: + with open(rcheck_dir / "xgboost-Ex.timings", "r") as fd: + timings = fd.readlines() + newlines = [] + for line in timings: + line = line.strip() + newlines.append(line) + con_timings = "\n".join(newlines) + df = pd.read_csv(StringIO(con_timings), delimiter="\t") + ratio_n = 
"user/elapsed" + df[ratio_n] = df["user"] / df["elapsed"] + offending = df[df[ratio_n] > threshold] + + try: + # requires the tabulate package + df.to_markdown("timings.md") + offending.to_markdown("offending.md") + except ImportError: + print("failed to export markdown files.") + pass + + if offending.shape[0] == 0: + return + + print(offending) + raise ValueError("There are examples using too many threads") + + @cd(ROOT) @record_time def check_rpackage(path: str) -> None: env = os.environ.copy() print("Ncpus:", f"{os.cpu_count()}") + threshold = 2.5 env.update( { "MAKEFLAGS": f"-j{os.cpu_count()}", # cran specific environment variables - "_R_CHECK_EXAMPLE_TIMING_CPU_TO_ELAPSED_THRESHOLD_": str(2.5), + "_R_CHECK_EXAMPLE_TIMING_CPU_TO_ELAPSED_THRESHOLD_": str(threshold), + "_R_CHECK_TEST_TIMING_CPU_TO_ELAPSED_THRESHOLD_": str(threshold), + "_R_CHECK_VIGNETTE_TIMING_CPU_TO_ELAPSED_THRESHOLD_": str(threshold), } ) @@ -118,11 +155,14 @@ def check_rpackage(path: str) -> None: CC = os.path.join(mingw_bin, "gcc.exe") env.update({"CC": CC, "CXX": CXX}) - status = subprocess.run([R, "CMD", "check", "--as-cran", path], env=env) - with open(Path("xgboost.Rcheck") / "00check.log", "r") as fd: + status = subprocess.run( + [R, "CMD", "check", "--as-cran", "--timings", path], env=env + ) + rcheck_dir = Path("xgboost.Rcheck") + with open(rcheck_dir / "00check.log", "r") as fd: check_log = fd.read() - with open(Path("xgboost.Rcheck") / "00install.out", "r") as fd: + with open(rcheck_dir / "00install.out", "r") as fd: install_log = fd.read() msg = f""" @@ -144,6 +184,8 @@ def check_rpackage(path: str) -> None: if check_log.find("Examples with CPU time") != -1: print(msg) raise ValueError("Suspicious NOTE.") + if pd is not None: + check_example_timing(rcheck_dir, threshold) @cd(R_PACKAGE) @@ -264,6 +306,8 @@ def main(args: argparse.Namespace) -> None: test_with_autotools() else: test_with_cmake(args) + elif args.task == "timings": + check_example_timing(Path("xgboost.Rcheck"), 2.5) else: raise ValueError("Unexpected task.") @@ -279,7 +323,7 @@ if __name__ == "__main__": parser.add_argument( "--task", type=str, - choices=["pack", "build", "check", "doc"], + choices=["pack", "build", "check", "doc", "timings"], default="check", required=False, ) diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt index e833c7a15..2b2b12a3a 100644 --- a/tests/cpp/CMakeLists.txt +++ b/tests/cpp/CMakeLists.txt @@ -1,17 +1,17 @@ -if (USE_DMLC_GTEST) - if (NOT TARGET gtest) +if(USE_DMLC_GTEST) + if(NOT TARGET gtest) message(FATAL_ERROR "USE_DMLC_GTEST=ON but dmlc-core didn't bundle gtest") - endif (NOT TARGET gtest) + endif() set(GTEST_LIBRARIES gtest) -else (USE_DMLC_GTEST) +else() find_package(GTest REQUIRED) -endif (USE_DMLC_GTEST) +endif() file(GLOB_RECURSE TEST_SOURCES "*.cc") -if (USE_CUDA) +if(USE_CUDA) file(GLOB_RECURSE CUDA_TEST_SOURCES "*.cu") list(APPEND TEST_SOURCES ${CUDA_TEST_SOURCES}) -endif (USE_CUDA) +endif() if (USE_HIP) file(GLOB_RECURSE HIP_TEST_SOURCES "*.hip") @@ -19,24 +19,24 @@ if (USE_HIP) endif (USE_HIP) file(GLOB_RECURSE ONEAPI_TEST_SOURCES "plugin/*_oneapi.cc") -if (NOT PLUGIN_UPDATER_ONEAPI) +if(NOT PLUGIN_UPDATER_ONEAPI) list(REMOVE_ITEM TEST_SOURCES ${ONEAPI_TEST_SOURCES}) -endif (NOT PLUGIN_UPDATER_ONEAPI) +endif() -if (PLUGIN_FEDERATED) +if(PLUGIN_FEDERATED) target_include_directories(testxgboost PRIVATE ${xgboost_SOURCE_DIR}/plugin/federated) target_link_libraries(testxgboost PRIVATE federated_client) -else (PLUGIN_FEDERATED) +else() file(GLOB_RECURSE FEDERATED_TEST_SOURCES 
"plugin/*_federated_*.*") list(REMOVE_ITEM TEST_SOURCES ${FEDERATED_TEST_SOURCES}) -endif (PLUGIN_FEDERATED) +endif() target_sources(testxgboost PRIVATE ${TEST_SOURCES} ${xgboost_SOURCE_DIR}/plugin/example/custom_obj.cc) -if (USE_CUDA AND PLUGIN_RMM) +if(USE_CUDA AND PLUGIN_RMM) find_package(CUDA) target_include_directories(testxgboost PRIVATE ${CUDA_INCLUDE_DIRS}) -endif (USE_CUDA AND PLUGIN_RMM) +endif() if (USE_HIP AND PLUGIN_RMM) find_package(HIP) diff --git a/tests/cpp/c_api/test_c_api.cc b/tests/cpp/c_api/test_c_api.cc index c02cf0bcf..41975ace8 100644 --- a/tests/cpp/c_api/test_c_api.cc +++ b/tests/cpp/c_api/test_c_api.cc @@ -108,6 +108,7 @@ TEST(CAPI, XGDMatrixCreateFromCSR) { Json::Dump(data_arr, &sdata); Json config{Object{}}; config["missing"] = Number{std::numeric_limits::quiet_NaN()}; + config["data_split_mode"] = Integer{static_cast(DataSplitMode::kCol)}; Json::Dump(config, &sconfig); DMatrixHandle handle; @@ -120,6 +121,8 @@ TEST(CAPI, XGDMatrixCreateFromCSR) { ASSERT_EQ(n, 3); ASSERT_EQ(XGDMatrixNumNonMissing(handle, &n), 0); ASSERT_EQ(n, 3); + ASSERT_EQ(XGDMatrixDataSplitMode(handle, &n), 0); + ASSERT_EQ(n, static_cast(DataSplitMode::kCol)); std::shared_ptr *pp_fmat = static_cast *>(handle); diff --git a/tests/cpp/collective/net_test.h b/tests/cpp/collective/net_test.h new file mode 100644 index 000000000..ed15ed256 --- /dev/null +++ b/tests/cpp/collective/net_test.h @@ -0,0 +1,41 @@ +/** + * Copyright 2022-2023, XGBoost Contributors + */ +#pragma once + +#include +#include + +#include // ifstream + +#include "../helpers.h" // for FileExists + +namespace xgboost::collective { +class SocketTest : public ::testing::Test { + protected: + std::string skip_msg_{"Skipping IPv6 test"}; + + bool SkipTest() { + std::string path{"/sys/module/ipv6/parameters/disable"}; + if (FileExists(path)) { + std::ifstream fin(path); + if (!fin) { + return true; + } + std::string s_value; + fin >> s_value; + auto value = std::stoi(s_value); + if (value != 0) { + return true; + } + } else { + return true; + } + return false; + } + + protected: + void SetUp() override { system::SocketStartup(); } + void TearDown() override { system::SocketFinalize(); } +}; +} // namespace xgboost::collective diff --git a/tests/cpp/collective/test_allgather.cc b/tests/cpp/collective/test_allgather.cc new file mode 100644 index 000000000..a74b9f149 --- /dev/null +++ b/tests/cpp/collective/test_allgather.cc @@ -0,0 +1,117 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include // for ASSERT_EQ +#include // for Span, oper... + +#include // for min +#include // for seconds +#include // for size_t +#include // for int32_t +#include // for iota +#include // for string +#include // for thread +#include // for vector + +#include "../../../src/collective/allgather.h" // for RingAllgather +#include "../../../src/collective/comm.h" // for RabitComm +#include "gtest/gtest.h" // for AssertionR... +#include "test_worker.h" // for TestDistri... 
+#include "xgboost/collective/result.h" // for Result + +namespace xgboost::collective { +namespace { +class AllgatherTest : public TrackerTest {}; + +class Worker : public WorkerForTest { + public: + using WorkerForTest::WorkerForTest; + + void Run() { + { + // basic test + std::vector data(comm_.World(), 0); + data[comm_.Rank()] = comm_.Rank(); + + auto rc = RingAllgather(this->comm_, common::Span{data.data(), data.size()}, 1); + ASSERT_TRUE(rc.OK()) << rc.Report(); + + for (std::int32_t r = 0; r < comm_.World(); ++r) { + ASSERT_EQ(data[r], r); + } + } + { + // test for limited socket buffer + this->LimitSockBuf(4096); + + std::size_t n = 8192; // n_bytes = 8192 * sizeof(int) + std::vector data(comm_.World() * n, 0); + auto s_data = common::Span{data.data(), data.size()}; + auto seg = s_data.subspan(comm_.Rank() * n, n); + std::iota(seg.begin(), seg.end(), comm_.Rank()); + + auto rc = RingAllgather(comm_, common::Span{data.data(), data.size()}, n); + ASSERT_TRUE(rc.OK()) << rc.Report(); + + for (std::int32_t r = 0; r < comm_.World(); ++r) { + auto seg = s_data.subspan(r * n, n); + for (std::int32_t i = 0; i < static_cast(seg.size()); ++i) { + auto v = seg[i]; + ASSERT_EQ(v, r + i); + } + } + } + } + + void TestV() { + { + // basic test + std::int32_t n{comm_.Rank()}; + std::vector result; + auto rc = RingAllgatherV(comm_, common::Span{&n, 1}, &result); + ASSERT_TRUE(rc.OK()) << rc.Report(); + for (std::int32_t i = 0; i < comm_.World(); ++i) { + ASSERT_EQ(result[i], i); + } + } + + { + // V test + std::vector data(comm_.Rank() + 1, comm_.Rank()); + std::vector result; + auto rc = RingAllgatherV(comm_, common::Span{data.data(), data.size()}, &result); + ASSERT_TRUE(rc.OK()) << rc.Report(); + ASSERT_EQ(result.size(), (1 + comm_.World()) * comm_.World() / 2); + std::int32_t k{0}; + for (std::int32_t r = 0; r < comm_.World(); ++r) { + auto seg = common::Span{result.data(), result.size()}.subspan(k, (r + 1)); + if (comm_.Rank() == 0) { + for (auto v : seg) { + ASSERT_EQ(v, r); + } + k += seg.size(); + } + } + } + } +}; +} // namespace + +TEST_F(AllgatherTest, Basic) { + std::int32_t n_workers = std::min(7u, std::thread::hardware_concurrency()); + TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout, + std::int32_t r) { + Worker worker{host, port, timeout, n_workers, r}; + worker.Run(); + }); +} + +TEST_F(AllgatherTest, V) { + std::int32_t n_workers = std::min(7u, std::thread::hardware_concurrency()); + TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout, + std::int32_t r) { + Worker worker{host, port, timeout, n_workers, r}; + worker.TestV(); + }); +} +} // namespace xgboost::collective diff --git a/tests/cpp/collective/test_allreduce.cc b/tests/cpp/collective/test_allreduce.cc new file mode 100644 index 000000000..62b87e411 --- /dev/null +++ b/tests/cpp/collective/test_allreduce.cc @@ -0,0 +1,72 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include + +#include "../../../src/collective/allreduce.h" +#include "../../../src/collective/tracker.h" +#include "test_worker.h" // for WorkerForTest, TestDistributed + +namespace xgboost::collective { + +namespace { +class AllreduceWorker : public WorkerForTest { + public: + using WorkerForTest::WorkerForTest; + + void Basic() { + { + std::vector data(13, 0.0); + Allreduce(comm_, common::Span{data.data(), data.size()}, [](auto lhs, auto rhs) { + for (std::size_t i = 0; i < rhs.size(); ++i) { + rhs[i] += lhs[i]; + } + }); + 
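The lambda just above is the entire reduction: `Allreduce` folds every rank's buffer into the local one elementwise, so all-zero inputs stay zero and all-one inputs sum to the world size, which is exactly what the surrounding assertions check. A single-process sketch of that fold (threads and networking elided, semantics assumed from the test itself):

#include <cassert>
#include <cstddef>
#include <vector>

int main() {
  int const world = 3;
  std::vector<std::vector<double>> rank_data(world, std::vector<double>(4, 1.0));
  std::vector<double> acc(4, 0.0);
  for (auto const& lhs : rank_data) {  // same shape as the test's lambda
    for (std::size_t i = 0; i < acc.size(); ++i) {
      acc[i] += lhs[i];
    }
  }
  assert(acc[0] == static_cast<double>(world));  // mirrors the ASSERT_EQ below
  return 0;
}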
ASSERT_EQ(std::accumulate(data.cbegin(), data.cend(), 0.0), 0.0); + } + { + std::vector data(1, 1.0); + Allreduce(comm_, common::Span{data.data(), data.size()}, [](auto lhs, auto rhs) { + for (std::size_t i = 0; i < rhs.size(); ++i) { + rhs[i] += lhs[i]; + } + }); + ASSERT_EQ(data[0], static_cast(comm_.World())); + } + } + + void Acc() { + std::vector data(314, 1.5); + Allreduce(comm_, common::Span{data.data(), data.size()}, [](auto lhs, auto rhs) { + for (std::size_t i = 0; i < rhs.size(); ++i) { + rhs[i] += lhs[i]; + } + }); + for (std::size_t i = 0; i < data.size(); ++i) { + auto v = data[i]; + ASSERT_EQ(v, 1.5 * static_cast(comm_.World())) << i; + } + } +}; + +class AllreduceTest : public SocketTest {}; +} // namespace + +TEST_F(AllreduceTest, Basic) { + std::int32_t n_workers = std::min(7u, std::thread::hardware_concurrency()); + TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout, + std::int32_t r) { + AllreduceWorker worker{host, port, timeout, n_workers, r}; + worker.Basic(); + }); +} + +TEST_F(AllreduceTest, Sum) { + std::int32_t n_workers = std::min(7u, std::thread::hardware_concurrency()); + TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout, + std::int32_t r) { + AllreduceWorker worker{host, port, timeout, n_workers, r}; + worker.Acc(); + }); +} +} // namespace xgboost::collective diff --git a/tests/cpp/collective/test_broadcast.cc b/tests/cpp/collective/test_broadcast.cc new file mode 100644 index 000000000..0ade86567 --- /dev/null +++ b/tests/cpp/collective/test_broadcast.cc @@ -0,0 +1,51 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include +#include + +#include // for int32_t +#include // for string +#include // for thread +#include // for vector + +#include "../../../src/collective/broadcast.h" // for Broadcast +#include "../../../src/collective/tracker.h" // for GetHostAddress +#include "test_worker.h" // for WorkerForTest, TestDistributed + +namespace xgboost::collective { +namespace { +class Worker : public WorkerForTest { + public: + using WorkerForTest::WorkerForTest; + + void Run() { + for (std::int32_t r = 0; r < comm_.World(); ++r) { + // basic test + std::vector data(1, comm_.Rank()); + auto rc = Broadcast(this->comm_, common::Span{data.data(), data.size()}, r); + ASSERT_TRUE(rc.OK()) << rc.Report(); + ASSERT_EQ(data[0], r); + } + + for (std::int32_t r = 0; r < comm_.World(); ++r) { + std::vector data(1 << 16, comm_.Rank()); + auto rc = Broadcast(this->comm_, common::Span{data.data(), data.size()}, r); + ASSERT_TRUE(rc.OK()) << rc.Report(); + ASSERT_EQ(data[0], r); + } + } +}; + +class BroadcastTest : public SocketTest {}; +} // namespace + +TEST_F(BroadcastTest, Basic) { + std::int32_t n_workers = std::min(7u, std::thread::hardware_concurrency()); + TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout, + std::int32_t r) { + Worker worker{host, port, timeout, n_workers, r}; + worker.Run(); + }); +} +} // namespace xgboost::collective diff --git a/tests/cpp/collective/test_comm.cc b/tests/cpp/collective/test_comm.cc new file mode 100644 index 000000000..7792c4c25 --- /dev/null +++ b/tests/cpp/collective/test_comm.cc @@ -0,0 +1,47 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include + +#include "../../../src/collective/comm.h" +#include "test_worker.h" +namespace xgboost::collective { +namespace { +class CommTest : public TrackerTest {}; +} // namespace + +TEST_F(CommTest, Channel) { + auto n_workers = 4; + 
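This Channel test pairs workers pointwise, as the loop below shows: each even rank sends its rank over `Chan(i + 1)` and each odd rank receives from `Chan(i - 1)`, so every message crosses exactly one channel. A tiny in-process sketch of that pairing, with a plain array standing in for the socket channels:

#include <cassert>
#include <vector>

int main() {
  int const n_workers = 4;  // even, so every sender has a receiver
  std::vector<int> mailbox(n_workers, -1);
  for (int i = 0; i < n_workers; ++i) {
    if (i % 2 == 0) { mailbox[i + 1] = i; }           // ~ SendAll over Chan(i + 1)
  }
  for (int i = 0; i < n_workers; ++i) {
    if (i % 2 == 1) { assert(mailbox[i] == i - 1); }  // ~ RecvAll from Chan(i - 1)
  }
  return 0;
}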
RabitTracker tracker{host, n_workers, 0, timeout}; + auto fut = tracker.Run(); + + std::vector workers; + std::int32_t port = tracker.Port(); + + for (std::int32_t i = 0; i < n_workers; ++i) { + workers.emplace_back([=] { + WorkerForTest worker{host, port, timeout, n_workers, i}; + if (i % 2 == 0) { + auto p_chan = worker.Comm().Chan(i + 1); + p_chan->SendAll( + EraseType(common::Span{&i, static_cast(1)})); + auto rc = p_chan->Block(); + ASSERT_TRUE(rc.OK()) << rc.Report(); + } else { + auto p_chan = worker.Comm().Chan(i - 1); + std::int32_t r{-1}; + p_chan->RecvAll(EraseType(common::Span{&r, static_cast(1)})); + auto rc = p_chan->Block(); + ASSERT_TRUE(rc.OK()) << rc.Report(); + ASSERT_EQ(r, i - 1); + } + }); + } + + for (auto &w : workers) { + w.join(); + } + + ASSERT_TRUE(fut.get().OK()); +} +} // namespace xgboost::collective diff --git a/tests/cpp/collective/test_in_memory_communicator.cc b/tests/cpp/collective/test_in_memory_communicator.cc index f36e30e33..69c427a4e 100644 --- a/tests/cpp/collective/test_in_memory_communicator.cc +++ b/tests/cpp/collective/test_in_memory_communicator.cc @@ -29,6 +29,11 @@ class InMemoryCommunicatorTest : public ::testing::Test { VerifyAllgather(comm, rank); } + static void AllgatherV(int rank) { + InMemoryCommunicator comm{kWorldSize, rank}; + VerifyAllgatherV(comm, rank); + } + static void AllreduceMax(int rank) { InMemoryCommunicator comm{kWorldSize, rank}; VerifyAllreduceMax(comm, rank); @@ -80,14 +85,19 @@ class InMemoryCommunicatorTest : public ::testing::Test { protected: static void VerifyAllgather(InMemoryCommunicator &comm, int rank) { - char buffer[kWorldSize] = {'a', 'b', 'c'}; - buffer[rank] = '0' + rank; - comm.AllGather(buffer, kWorldSize); + std::string input{static_cast('0' + rank)}; + auto output = comm.AllGather(input); for (auto i = 0; i < kWorldSize; i++) { - EXPECT_EQ(buffer[i], '0' + i); + EXPECT_EQ(output[i], static_cast('0' + i)); } } + static void VerifyAllgatherV(InMemoryCommunicator &comm, int rank) { + std::vector inputs{"a", "bb", "ccc"}; + auto output = comm.AllGatherV(inputs[rank]); + EXPECT_EQ(output, "abbccc"); + } + static void VerifyAllreduceMax(InMemoryCommunicator &comm, int rank) { int buffer[] = {1 + rank, 2 + rank, 3 + rank, 4 + rank, 5 + rank}; comm.AllReduce(buffer, sizeof(buffer) / sizeof(buffer[0]), DataType::kInt32, Operation::kMax); @@ -205,6 +215,8 @@ TEST(InMemoryCommunicatorSimpleTest, IsDistributed) { TEST_F(InMemoryCommunicatorTest, Allgather) { Verify(&Allgather); } +TEST_F(InMemoryCommunicatorTest, AllgatherV) { Verify(&AllgatherV); } + TEST_F(InMemoryCommunicatorTest, AllreduceMax) { Verify(&AllreduceMax); } TEST_F(InMemoryCommunicatorTest, AllreduceMin) { Verify(&AllreduceMin); } diff --git a/tests/cpp/collective/test_loop.cc b/tests/cpp/collective/test_loop.cc new file mode 100644 index 000000000..4686060ce --- /dev/null +++ b/tests/cpp/collective/test_loop.cc @@ -0,0 +1,81 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include // for ASSERT_TRUE, ASSERT_EQ +#include // for TCPSocket, Connect, SocketFinalize, SocketStartup +#include // for StringView + +#include // for seconds +#include // for int8_t +#include // for make_shared, shared_ptr +#include // for make_error_code, errc +#include // for pair +#include // for vector + +#include "../../../src/collective/loop.h" // for Loop + +namespace xgboost::collective { +namespace { +class LoopTest : public ::testing::Test { + protected: + std::pair pair_; + std::shared_ptr loop_; + + protected: + void SetUp() override { + 
system::SocketStartup(); + std::chrono::seconds timeout{1}; + + auto domain = SockDomain::kV4; + pair_.first = TCPSocket::Create(domain); + auto port = pair_.first.BindHost(); + pair_.first.Listen(); + + auto const& addr = SockAddrV4::Loopback().Addr(); + auto rc = Connect(StringView{addr}, port, 1, timeout, &pair_.second); + ASSERT_TRUE(rc.OK()); + rc = pair_.second.NonBlocking(true); + ASSERT_TRUE(rc.OK()); + + pair_.first = pair_.first.Accept(); + rc = pair_.first.NonBlocking(true); + ASSERT_TRUE(rc.OK()); + + loop_ = std::make_shared(timeout); + } + + void TearDown() override { + pair_ = decltype(pair_){}; + system::SocketFinalize(); + } +}; +} // namespace + +TEST_F(LoopTest, Timeout) { + std::vector data(1); + Loop::Op op{Loop::Op::kRead, 0, data.data(), data.size(), &pair_.second, 0}; + loop_->Submit(op); + auto rc = loop_->Block(); + ASSERT_FALSE(rc.OK()); + ASSERT_EQ(rc.Code(), std::make_error_code(std::errc::timed_out)) << rc.Report(); +} + +TEST_F(LoopTest, Op) { + TCPSocket& send = pair_.first; + TCPSocket& recv = pair_.second; + + std::vector wbuf(1, 1); + std::vector rbuf(1, 0); + + Loop::Op wop{Loop::Op::kWrite, 0, wbuf.data(), wbuf.size(), &send, 0}; + Loop::Op rop{Loop::Op::kRead, 0, rbuf.data(), rbuf.size(), &recv, 0}; + + loop_->Submit(wop); + loop_->Submit(rop); + + auto rc = loop_->Block(); + ASSERT_TRUE(rc.OK()) << rc.Report(); + + ASSERT_EQ(rbuf[0], wbuf[0]); +} +} // namespace xgboost::collective diff --git a/tests/cpp/collective/test_nccl_device_communicator.cu b/tests/cpp/collective/test_nccl_device_communicator.cu index 986085d42..1402dee37 100644 --- a/tests/cpp/collective/test_nccl_device_communicator.cu +++ b/tests/cpp/collective/test_nccl_device_communicator.cu @@ -38,7 +38,7 @@ void VerifyAllReduceBitwiseAND() { auto const rank = collective::GetRank(); std::bitset<64> original{}; original[rank] = true; - HostDeviceVector buffer({original.to_ullong()}, rank); + HostDeviceVector buffer({original.to_ullong()}, DeviceOrd::CUDA(rank)); collective::AllReduce(rank, buffer.DevicePointer(), 1); collective::Synchronize(rank); EXPECT_EQ(buffer.HostVector()[0], 0ULL); @@ -60,7 +60,7 @@ void VerifyAllReduceBitwiseOR() { auto const rank = collective::GetRank(); std::bitset<64> original{}; original[rank] = true; - HostDeviceVector buffer({original.to_ullong()}, rank); + HostDeviceVector buffer({original.to_ullong()}, DeviceOrd::CUDA(rank)); collective::AllReduce(rank, buffer.DevicePointer(), 1); collective::Synchronize(rank); EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1); @@ -82,7 +82,7 @@ void VerifyAllReduceBitwiseXOR() { auto const rank = collective::GetRank(); std::bitset<64> original{~0ULL}; original[rank] = false; - HostDeviceVector buffer({original.to_ullong()}, rank); + HostDeviceVector buffer({original.to_ullong()}, DeviceOrd::CUDA(rank)); collective::AllReduce(rank, buffer.DevicePointer(), 1); collective::Synchronize(rank); EXPECT_EQ(buffer.HostVector()[0], (1ULL << world_size) - 1); diff --git a/tests/cpp/collective/test_socket.cc b/tests/cpp/collective/test_socket.cc index ddc73d1f2..ced795fef 100644 --- a/tests/cpp/collective/test_socket.cc +++ b/tests/cpp/collective/test_socket.cc @@ -1,19 +1,16 @@ /** - * Copyright 2022-2023 by XGBoost Contributors + * Copyright 2022-2023, XGBoost Contributors */ #include #include #include // EADDRNOTAVAIL -#include // ifstream #include // std::error_code, std::system_category -#include "../helpers.h" +#include "test_worker.h" // for SocketTest namespace xgboost::collective { -TEST(Socket, Basic) { - 
system::SocketStartup(); - +TEST_F(SocketTest, Basic) { SockAddress addr{SockAddrV6::Loopback()}; ASSERT_TRUE(addr.IsV6()); addr = SockAddress{SockAddrV4::Loopback()}; @@ -54,23 +51,27 @@ TEST(Socket, Basic) { run_test(SockDomain::kV4); - std::string path{"/sys/module/ipv6/parameters/disable"}; - if (FileExists(path)) { - std::ifstream fin(path); - if (!fin) { - GTEST_SKIP_(msg.c_str()); - } - std::string s_value; - fin >> s_value; - auto value = std::stoi(s_value); - if (value != 0) { - GTEST_SKIP_(msg.c_str()); - } - } else { - GTEST_SKIP_(msg.c_str()); + if (SkipTest()) { + GTEST_SKIP_(skip_msg_.c_str()); } run_test(SockDomain::kV6); +} - system::SocketFinalize(); +TEST_F(SocketTest, Bind) { + auto run = [](SockDomain domain) { + auto any = + domain == SockDomain::kV4 ? SockAddrV4::InaddrAny().Addr() : SockAddrV6::InaddrAny().Addr(); + auto sock = TCPSocket::Create(domain); + std::int32_t port{0}; + auto rc = sock.Bind(any, &port); + ASSERT_TRUE(rc.OK()); + ASSERT_NE(port, 0); + }; + + run(SockDomain::kV4); + if (SkipTest()) { + GTEST_SKIP_(skip_msg_.c_str()); + } + run(SockDomain::kV6); } } // namespace xgboost::collective diff --git a/tests/cpp/collective/test_tracker.cc b/tests/cpp/collective/test_tracker.cc new file mode 100644 index 000000000..8fc5f0b3f --- /dev/null +++ b/tests/cpp/collective/test_tracker.cc @@ -0,0 +1,67 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include + +#include // for seconds +#include // for int32_t +#include // for string +#include // for thread +#include // for vector + +#include "../../../src/collective/comm.h" +#include "test_worker.h" + +namespace xgboost::collective { +namespace { +class PrintWorker : public WorkerForTest { + public: + using WorkerForTest::WorkerForTest; + + void Print() { + auto rc = comm_.LogTracker("ack:" + std::to_string(this->comm_.Rank())); + ASSERT_TRUE(rc.OK()) << rc.Report(); + } +}; +} // namespace + +TEST_F(TrackerTest, Bootstrap) { + RabitTracker tracker{host, n_workers, 0, timeout}; + auto fut = tracker.Run(); + + std::vector workers; + std::int32_t port = tracker.Port(); + + for (std::int32_t i = 0; i < n_workers; ++i) { + workers.emplace_back([=] { WorkerForTest worker{host, port, timeout, n_workers, i}; }); + } + for (auto &w : workers) { + w.join(); + } + + ASSERT_TRUE(fut.get().OK()); +} + +TEST_F(TrackerTest, Print) { + RabitTracker tracker{host, n_workers, 0, timeout}; + auto fut = tracker.Run(); + + std::vector workers; + std::int32_t port = tracker.Port(); + + for (std::int32_t i = 0; i < n_workers; ++i) { + workers.emplace_back([=] { + PrintWorker worker{host, port, timeout, n_workers, i}; + worker.Print(); + }); + } + + for (auto &w : workers) { + w.join(); + } + + ASSERT_TRUE(fut.get().OK()); +} + +TEST_F(TrackerTest, GetHostAddress) { ASSERT_TRUE(host.find("127.") == std::string::npos); } +} // namespace xgboost::collective diff --git a/tests/cpp/collective/test_worker.h b/tests/cpp/collective/test_worker.h new file mode 100644 index 000000000..a3d6de875 --- /dev/null +++ b/tests/cpp/collective/test_worker.h @@ -0,0 +1,114 @@ +/** + * Copyright 2023, XGBoost Contributors + */ +#include + +#include // for seconds +#include // for int32_t +#include // for string +#include // for thread +#include // for move +#include // for vector + +#include "../../../src/collective/comm.h" +#include "../../../src/collective/tracker.h" // for GetHostAddress +#include "../helpers.h" // for FileExists + +namespace xgboost::collective { +class WorkerForTest { + std::string tracker_host_; + std::int32_t tracker_port_; 
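WorkerForTest and the TestDistributed helper further down in this header give every collective test the same shape: start a tracker, launch one thread per rank, run the test body, join, then check the tracker's Result. A self-contained miniature of that pattern follows; the real sockets, RabitTracker, and collective::Result are elided.

#include <cassert>
#include <future>
#include <thread>
#include <vector>

template <typename WorkerFn>
void MiniTestDistributed(int n_workers, WorkerFn worker_fn) {
  std::promise<bool> ok;
  auto fut = ok.get_future();  // the real tracker's Run() also returns a future
  std::vector<std::thread> workers;
  for (int i = 0; i < n_workers; ++i) {
    workers.emplace_back([worker_fn, i] { worker_fn(i); });  // rank-parameterised body
  }
  for (auto& t : workers) {
    t.join();
  }
  ok.set_value(true);  // RabitTracker would report a collective::Result here
  assert(fut.get());
}

int main() {
  MiniTestDistributed(4, [](int rank) { (void)rank; /* exercise the comm */ });
  return 0;
}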
+ std::int32_t world_size_; + + protected: + std::int32_t retry_{1}; + std::string task_id_; + RabitComm comm_; + + public: + WorkerForTest(std::string host, std::int32_t port, std::chrono::seconds timeout, + std::int32_t world, std::int32_t rank) + : tracker_host_{std::move(host)}, + tracker_port_{port}, + world_size_{world}, + task_id_{"t:" + std::to_string(rank)}, + comm_{tracker_host_, tracker_port_, timeout, retry_, task_id_} { + CHECK_EQ(world_size_, comm_.World()); + } + virtual ~WorkerForTest() = default; + auto& Comm() { return comm_; } + + void LimitSockBuf(std::int32_t n_bytes) { + for (std::int32_t i = 0; i < comm_.World(); ++i) { + if (i != comm_.Rank()) { + ASSERT_TRUE(comm_.Chan(i)->Socket()->NonBlocking()); + ASSERT_TRUE(comm_.Chan(i)->Socket()->SetBufSize(n_bytes).OK()); + } + } + } +}; + +class SocketTest : public ::testing::Test { + protected: + std::string skip_msg_{"Skipping IPv6 test"}; + + bool SkipTest() { + std::string path{"/sys/module/ipv6/parameters/disable"}; + if (FileExists(path)) { + std::ifstream fin(path); + if (!fin) { + return true; + } + std::string s_value; + fin >> s_value; + auto value = std::stoi(s_value); + if (value != 0) { + return true; + } + } else { + return true; + } + return false; + } + + protected: + void SetUp() override { system::SocketStartup(); } + void TearDown() override { system::SocketFinalize(); } +}; + +class TrackerTest : public SocketTest { + public: + std::int32_t n_workers{2}; + std::chrono::seconds timeout{1}; + std::string host; + + void SetUp() override { + SocketTest::SetUp(); + auto rc = GetHostAddress(&host); + ASSERT_TRUE(rc.OK()) << rc.Report(); + } +}; + +template +void TestDistributed(std::int32_t n_workers, WorkerFn worker_fn) { + std::chrono::seconds timeout{1}; + + std::string host; + ASSERT_TRUE(GetHostAddress(&host).OK()); + RabitTracker tracker{StringView{host}, n_workers, 0, timeout}; + auto fut = tracker.Run(); + + std::vector workers; + std::int32_t port = tracker.Port(); + + for (std::int32_t i = 0; i < n_workers; ++i) { + workers.emplace_back([=] { worker_fn(host, port, timeout, i); }); + } + + for (auto& t : workers) { + t.join(); + } + + ASSERT_TRUE(fut.get().OK()); +} +} // namespace xgboost::collective diff --git a/tests/cpp/common/test_hist_util.cc b/tests/cpp/common/test_hist_util.cc index 70ebecd3d..5391bc2cf 100644 --- a/tests/cpp/common/test_hist_util.cc +++ b/tests/cpp/common/test_hist_util.cc @@ -147,7 +147,7 @@ TEST(CutsBuilder, SearchGroupInd) { EXPECT_ANY_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17)); - p_mat->Info().Validate(-1); + p_mat->Info().Validate(DeviceOrd::CPU()); EXPECT_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17), dmlc::Error); @@ -330,7 +330,7 @@ TEST(HistUtil, IndexBinData) { void TestSketchFromWeights(bool with_group) { size_t constexpr kRows = 300, kCols = 20, kBins = 256; size_t constexpr kGroups = 10; - auto m = RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateDMatrix(); + auto m = RandomDataGenerator{kRows, kCols, 0}.Device(DeviceOrd::CUDA(0)).GenerateDMatrix(); Context ctx; common::HistogramCuts cuts = SketchOnDMatrix(&ctx, m.get(), kBins); diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index c029413be..50f673a12 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -222,7 +222,7 @@ TEST(HistUtil, RemoveDuplicatedCategories) { ASSERT_EQ(info.feature_types.Size(), n_features); HostDeviceVector cuts_ptr{0, n_samples, n_samples 
* 2, n_samples * 3}; - cuts_ptr.SetDevice(0); + cuts_ptr.SetDevice(DeviceOrd::CUDA(0)); dh::device_vector weight(n_samples * n_features, 0); dh::Iota(dh::ToSpan(weight)); @@ -235,7 +235,7 @@ TEST(HistUtil, RemoveDuplicatedCategories) { thrust::sort_by_key(sorted_entries.begin(), sorted_entries.end(), weight.begin(), detail::EntryCompareOp()); - detail::RemoveDuplicatedCategories(ctx.gpu_id, info, cuts_ptr.DeviceSpan(), &sorted_entries, + detail::RemoveDuplicatedCategories(ctx.Device(), info, cuts_ptr.DeviceSpan(), &sorted_entries, &weight, &columns_ptr); auto const& h_cptr = cuts_ptr.ConstHostVector(); @@ -377,7 +377,8 @@ template auto MakeUnweightedCutsForTest(Adapter adapter, int32_t num_bins, float missing, size_t batch_size = 0) { common::HistogramCuts batched_cuts; HostDeviceVector ft; - SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(), 0); + SketchContainer sketch_container(ft, num_bins, adapter.NumColumns(), adapter.NumRows(), + DeviceOrd::CUDA(0)); MetaInfo info; AdapterDeviceSketch(adapter.Value(), num_bins, info, missing, &sketch_container, batch_size); sketch_container.MakeCuts(&batched_cuts, info.IsColumnSplit()); @@ -444,7 +445,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowMemory) { ConsoleLogger::Configure({{"verbosity", "3"}}); common::HistogramCuts batched_cuts; HostDeviceVector ft; - SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, 0); + SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, DeviceOrd::CUDA(0)); AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits::quiet_NaN(), &sketch_container); HistogramCuts cuts; @@ -472,7 +473,7 @@ TEST(HistUtil, AdapterSketchSlidingWindowWeightedMemory) { ConsoleLogger::Configure({{"verbosity", "3"}}); common::HistogramCuts batched_cuts; HostDeviceVector ft; - SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, 0); + SketchContainer sketch_container(ft, num_bins, num_columns, num_rows, DeviceOrd::CUDA(0)); AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits::quiet_NaN(), &sketch_container); @@ -507,7 +508,7 @@ void TestCategoricalSketchAdapter(size_t n, size_t num_categories, } ASSERT_EQ(info.feature_types.Size(), 1); - SketchContainer container(info.feature_types, num_bins, 1, n, 0); + SketchContainer container(info.feature_types, num_bins, 1, n, DeviceOrd::CUDA(0)); AdapterDeviceSketch(adapter.Value(), num_bins, info, std::numeric_limits::quiet_NaN(), &container); HistogramCuts cuts; @@ -580,11 +581,7 @@ TEST(HistUtil, AdapterDeviceSketchBatches) { namespace { auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) { -#if defined(XGBOOST_USE_CUDA) - dh::safe_cuda(cudaSetDevice(ctx->gpu_id)); -#elif defined(XGBOOST_USE_HIP) - dh::safe_cuda(hipSetDevice(ctx->gpu_id)); -#endif + dh::safe_cuda(cudaSetDevice(ctx->Ordinal())); auto n = n_samples * n_features; std::vector x; x.resize(n); @@ -624,21 +621,21 @@ void TestGetColumnSize(std::size_t n_samples) { std::vector h_column_size_1(column_sizes_scan.size()); detail::LaunchGetColumnSizeKernel( - ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); + ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size.begin()); detail::LaunchGetColumnSizeKernel( - ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); + ctx.Device(), IterSpan{batch_iter, 
batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin()); ASSERT_EQ(h_column_size, h_column_size_1); detail::LaunchGetColumnSizeKernel( - ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); + ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin()); ASSERT_EQ(h_column_size, h_column_size_1); detail::LaunchGetColumnSizeKernel( - ctx.gpu_id, IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); + ctx.Device(), IterSpan{batch_iter, batch.Size()}, is_valid, dh::ToSpan(column_sizes_scan)); thrust::copy(column_sizes_scan.begin(), column_sizes_scan.end(), h_column_size_1.begin()); ASSERT_EQ(h_column_size, h_column_size_1); } @@ -715,9 +712,9 @@ void TestAdapterSketchFromWeights(bool with_group) { size_t constexpr kRows = 300, kCols = 20, kBins = 256; size_t constexpr kGroups = 10; HostDeviceVector storage; - std::string m = - RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateArrayInterface( - &storage); + std::string m = RandomDataGenerator{kRows, kCols, 0} + .Device(DeviceOrd::CUDA(0)) + .GenerateArrayInterface(&storage); MetaInfo info; Context ctx; auto& h_weights = info.weights_.HostVector(); @@ -736,14 +733,14 @@ void TestAdapterSketchFromWeights(bool with_group) { info.SetInfo(ctx, "group", groups.data(), DataType::kUInt32, kGroups); } - info.weights_.SetDevice(0); + info.weights_.SetDevice(DeviceOrd::CUDA(0)); info.num_row_ = kRows; info.num_col_ = kCols; data::CupyAdapter adapter(m); auto const& batch = adapter.Value(); HostDeviceVector ft; - SketchContainer sketch_container(ft, kBins, kCols, kRows, 0); + SketchContainer sketch_container(ft, kBins, kCols, kRows, DeviceOrd::CUDA(0)); AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits::quiet_NaN(), &sketch_container); @@ -787,7 +784,7 @@ void TestAdapterSketchFromWeights(bool with_group) { // https://github.com/dmlc/xgboost/issues/7946 h_weights[i] = (i % 2 == 0 ? 1 : 2) / static_cast(kGroups); } - SketchContainer sketch_container(ft, kBins, kCols, kRows, 0); + SketchContainer sketch_container{ft, kBins, kCols, kRows, DeviceOrd::CUDA(0)}; AdapterDeviceSketch(adapter.Value(), kBins, info, std::numeric_limits::quiet_NaN(), &sketch_container); sketch_container.MakeCuts(&weighted, info.IsColumnSplit()); diff --git a/tests/cpp/common/test_host_device_vector.cu b/tests/cpp/common/test_host_device_vector.cu index 5ac155e09..0783f3a33 100644 --- a/tests/cpp/common/test_host_device_vector.cu +++ b/tests/cpp/common/test_host_device_vector.cu @@ -1,7 +1,6 @@ -/*! 
- * Copyright 2018 XGBoost contributors +/** + * Copyright 2018-2023 XGBoost contributors */ - #include #include #include @@ -13,21 +12,14 @@ #endif #include -namespace xgboost { -namespace common { +namespace xgboost::common { namespace { -void SetDeviceForTest(int device) { +void SetDeviceForTest(DeviceOrd device) { int n_devices; -#if defined(XGBOOST_USE_CUDA) dh::safe_cuda(cudaGetDeviceCount(&n_devices)); - device %= n_devices; - dh::safe_cuda(cudaSetDevice(device)); -#elif defined(XGBOOST_USE_HIP) - dh::safe_cuda(hipGetDeviceCount(&n_devices)); - device %= n_devices; - dh::safe_cuda(hipSetDevice(device)); -#endif + device.ordinal %= n_devices; + dh::safe_cuda(cudaSetDevice(device.ordinal)); } } // namespace @@ -42,13 +34,13 @@ struct HostDeviceVectorSetDeviceHandler { } }; -void InitHostDeviceVector(size_t n, int device, HostDeviceVector *v) { +void InitHostDeviceVector(size_t n, DeviceOrd device, HostDeviceVector *v) { // create the vector v->SetDevice(device); v->Resize(n); ASSERT_EQ(v->Size(), n); - ASSERT_EQ(v->DeviceIdx(), device); + ASSERT_EQ(v->Device(), device); // ensure that the device have read-write access ASSERT_TRUE(v->DeviceCanRead()); ASSERT_TRUE(v->DeviceCanWrite()); @@ -68,7 +60,7 @@ void InitHostDeviceVector(size_t n, int device, HostDeviceVector *v) { } void PlusOne(HostDeviceVector *v) { - int device = v->DeviceIdx(); + auto device = v->Device(); SetDeviceForTest(device); thrust::transform(dh::tcbegin(*v), dh::tcend(*v), dh::tbegin(*v), [=]__device__(unsigned int a){ return a + 1; }); @@ -80,7 +72,7 @@ void CheckDevice(HostDeviceVector* v, unsigned int first, GPUAccess access) { ASSERT_EQ(v->Size(), size); - SetDeviceForTest(v->DeviceIdx()); + SetDeviceForTest(v->Device()); ASSERT_TRUE(thrust::equal(dh::tcbegin(*v), dh::tcend(*v), thrust::make_counting_iterator(first))); @@ -111,7 +103,7 @@ void CheckHost(HostDeviceVector *v, GPUAccess access) { ASSERT_FALSE(v->DeviceCanWrite()); } -void TestHostDeviceVector(size_t n, int device) { +void TestHostDeviceVector(size_t n, DeviceOrd device) { HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice); HostDeviceVector v; InitHostDeviceVector(n, device, &v); @@ -124,13 +116,13 @@ void TestHostDeviceVector(size_t n, int device) { TEST(HostDeviceVector, Basic) { size_t n = 1001; - int device = 0; + DeviceOrd device = DeviceOrd::CUDA(0); TestHostDeviceVector(n, device); } TEST(HostDeviceVector, Copy) { size_t n = 1001; - int device = 0; + auto device = DeviceOrd::CUDA(0); HostDeviceVectorSetDeviceHandler hdvec_dev_hndlr(SetDevice); HostDeviceVector v; @@ -154,15 +146,15 @@ TEST(HostDeviceVector, SetDevice) { h_vec[i] = i; } HostDeviceVector vec (h_vec); - auto device = 0; + auto device = DeviceOrd::CUDA(0); vec.SetDevice(device); ASSERT_EQ(vec.Size(), h_vec.size()); auto span = vec.DeviceSpan(); // sync to device - vec.SetDevice(-1); // pull back to cpu. + vec.SetDevice(DeviceOrd::CPU()); // pull back to cpu. 
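The assertions in this test encode HostDeviceVector's lazy-sync contract: `SetDevice()` only records the target, the next span or host-vector access performs the copy, and the read/write flags track which side currently owns the data. Below is a toy with the same permission flags, an illustration of the contract rather than the real implementation (the actual class keeps separate host and device buffers and a finer-grained GPUAccess state).

#include <cassert>
#include <vector>

enum class Access { kHost, kDevice };

struct ToyHDV {
  std::vector<float> data;       // single buffer standing in for both sides
  Access where{Access::kHost};
  bool HostCanWrite() const { return where == Access::kHost; }
  bool DeviceCanWrite() const { return where == Access::kDevice; }
  void PullToDevice() { where = Access::kDevice; }  // the copy is elided here
  void PullToHost() { where = Access::kHost; }
};

int main() {
  ToyHDV v{{1.f, 2.f, 3.f}};
  assert(v.HostCanWrite());
  v.PullToDevice();  // ~ SetDevice(DeviceOrd::CUDA(0)) then DeviceSpan()
  assert(v.DeviceCanWrite() && !v.HostCanWrite());
  v.PullToHost();    // ~ SetDevice(DeviceOrd::CPU()) then HostVector()
  assert(v.HostCanWrite());
  return 0;
}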
ASSERT_EQ(vec.Size(), h_vec.size()); - ASSERT_EQ(vec.DeviceIdx(), -1); + ASSERT_EQ(vec.Device(), DeviceOrd::CPU()); auto h_vec_1 = vec.HostVector(); ASSERT_TRUE(std::equal(h_vec_1.cbegin(), h_vec_1.cend(), h_vec.cbegin())); @@ -170,7 +162,7 @@ TEST(HostDeviceVector, SetDevice) { TEST(HostDeviceVector, Span) { HostDeviceVector vec {1.0f, 2.0f, 3.0f, 4.0f}; - vec.SetDevice(0); + vec.SetDevice(DeviceOrd::CUDA(0)); auto span = vec.DeviceSpan(); ASSERT_EQ(vec.Size(), span.size()); ASSERT_EQ(vec.DevicePointer(), span.data()); @@ -194,5 +186,4 @@ TEST(HostDeviceVector, Empty) { ASSERT_FALSE(another.Empty()); ASSERT_TRUE(vec.Empty()); } -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/tests/cpp/common/test_io.cc b/tests/cpp/common/test_io.cc index e4d65c1f4..4c4d4efe0 100644 --- a/tests/cpp/common/test_io.cc +++ b/tests/cpp/common/test_io.cc @@ -144,7 +144,8 @@ TEST(IO, Resource) { fout << 1.0 << std::endl; fout.close(); - auto resource = std::make_shared(path, 0, sizeof(double)); + auto resource = std::shared_ptr{ + new MmapResource{path, 0, sizeof(double)}}; ASSERT_EQ(resource->Size(), sizeof(double)); ASSERT_EQ(resource->Type(), ResourceHandler::kMmap); ASSERT_EQ(resource->DataAs()[0], val); diff --git a/tests/cpp/common/test_json.cc b/tests/cpp/common/test_json.cc index 4d498ffd5..d361552ce 100644 --- a/tests/cpp/common/test_json.cc +++ b/tests/cpp/common/test_json.cc @@ -1,13 +1,15 @@ /** - * Copyright (c) 2019-2023, XGBoost Contributors + * Copyright 2019-2023, XGBoost Contributors */ #include #include +#include // for back_inserter #include #include "../../../src/common/charconv.h" #include "../../../src/common/io.h" +#include "../../../src/common/json_utils.h" #include "../../../src/common/threading_utils.h" // for ParallelFor #include "../filesystem.h" // dmlc::TemporaryDirectory #include "../helpers.h" @@ -691,4 +693,16 @@ TEST(Json, TypeCheck) { ASSERT_NE(err.find("foo"), std::string::npos); } } + +TEST(Json, Dump) { + auto str = GetModelStr(); + auto jobj = Json::Load(str); + std::string result_s = Json::Dump(jobj); + + std::vector result_v = Json::Dump>(jobj); + ASSERT_EQ(result_s.size(), result_v.size()); + for (std::size_t i = 0; i < result_s.size(); ++i) { + ASSERT_EQ(result_s[i], result_v[i]); + } +} } // namespace xgboost diff --git a/tests/cpp/common/test_linalg.cu b/tests/cpp/common/test_linalg.cu index 67b3b79ff..8bc06447c 100644 --- a/tests/cpp/common/test_linalg.cu +++ b/tests/cpp/common/test_linalg.cu @@ -16,7 +16,7 @@ namespace xgboost::linalg { namespace { void TestElementWiseKernel() { auto device = DeviceOrd::CUDA(0); - Tensor l{{2, 3, 4}, 0}; + Tensor l{{2, 3, 4}, device}; { /** * Non-contiguous diff --git a/tests/cpp/common/test_quantile.cc b/tests/cpp/common/test_quantile.cc index 59d06f5d3..343f59cda 100644 --- a/tests/cpp/common/test_quantile.cc +++ b/tests/cpp/common/test_quantile.cc @@ -9,9 +9,7 @@ #include "../../../src/data/adapter.h" #include "xgboost/context.h" -namespace xgboost { -namespace common { - +namespace xgboost::common { TEST(Quantile, LoadBalance) { size_t constexpr kRows = 1000, kCols = 100; auto m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); @@ -314,7 +312,7 @@ void TestSameOnAllWorkers() { } auto m = RandomDataGenerator{kRows, kCols, 0} - .Device(Context::kCpuId) + .Device(DeviceOrd::CPU()) .Type(ft) .MaxCategory(17) .Seed(rank + seed) @@ -373,6 +371,4 @@ TEST(Quantile, SameOnAllWorkers) { auto constexpr kWorkers = 4; RunWithInMemoryCommunicator(kWorkers, TestSameOnAllWorkers); } - -} // 
namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index 1233a338d..5fe39e38a 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -32,7 +32,7 @@ class MGPUQuantileTest : public BaseMGPUTest {}; TEST(GPUQuantile, Basic) { constexpr size_t kRows = 1000, kCols = 100, kBins = 256; HostDeviceVector ft; - SketchContainer sketch(ft, kBins, kCols, kRows, 0); + SketchContainer sketch(ft, kBins, kCols, kRows, FstCU()); dh::caching_device_vector entries; dh::device_vector cuts_ptr(kCols+1); thrust::fill(cuts_ptr.begin(), cuts_ptr.end(), 0); @@ -45,12 +45,12 @@ void TestSketchUnique(float sparsity) { constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [kRows, kCols, sparsity](int32_t seed, size_t n_bins, MetaInfo const& info) { HostDeviceVector ft; - SketchContainer sketch(ft, n_bins, kCols, kRows, 0); + SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU()); HostDeviceVector storage; std::string interface_str = RandomDataGenerator{kRows, kCols, sparsity} .Seed(seed) - .Device(0) + .Device(FstCU()) .GenerateArrayInterface(&storage); data::CupyAdapter adapter(interface_str); AdapterDeviceSketch(adapter.Value(), n_bins, info, @@ -65,7 +65,7 @@ void TestSketchUnique(float sparsity) { thrust::make_counting_iterator(0llu), [=] __device__(size_t idx) { return batch.GetElement(idx); }); auto end = kCols * kRows; - detail::GetColumnSizesScan(0, kCols, n_cuts, IterSpan{batch_iter, end}, is_valid, + detail::GetColumnSizesScan(FstCU(), kCols, n_cuts, IterSpan{batch_iter, end}, is_valid, &cut_sizes_scan, &column_sizes_scan); auto const& cut_sizes = cut_sizes_scan.HostVector(); ASSERT_LE(sketch.Data().size(), cut_sizes.back()); @@ -93,13 +93,9 @@ TEST(GPUQuantile, Unique) { } // if with_error is true, the test tolerates floating point error -void TestQuantileElemRank(int32_t device, Span in, +void TestQuantileElemRank(DeviceOrd device, Span in, Span d_columns_ptr, bool with_error = false) { -#if defined(XGBOOST_USE_CUDA) - dh::safe_cuda(cudaSetDevice(device)); -#elif defined(XGBOOST_USE_HIP) - dh::safe_cuda(hipSetDevice(device)); -#endif + dh::safe_cuda(cudaSetDevice(device.ordinal)); std::vector h_in(in.size()); dh::CopyDeviceSpanToVector(&h_in, in); std::vector h_columns_ptr(d_columns_ptr.size()); @@ -134,13 +130,12 @@ TEST(GPUQuantile, Prune) { constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) { HostDeviceVector ft; - SketchContainer sketch(ft, n_bins, kCols, kRows, 0); + SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU()); HostDeviceVector storage; - std::string interface_str = RandomDataGenerator{kRows, kCols, 0} - .Device(0) - .Seed(seed) - .GenerateArrayInterface(&storage); + std::string interface_str = + RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface( + &storage); data::CupyAdapter adapter(interface_str); AdapterDeviceSketch(adapter.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &sketch); @@ -156,7 +151,7 @@ TEST(GPUQuantile, Prune) { ASSERT_TRUE(thrust::is_sorted(thrust::device, sketch.Data().data(), sketch.Data().data() + sketch.Data().size(), detail::SketchUnique{})); - TestQuantileElemRank(0, sketch.Data(), sketch.ColumnsPtr()); + TestQuantileElemRank(FstCU(), sketch.Data(), sketch.ColumnsPtr()); }); } @@ -164,10 +159,10 @@ TEST(GPUQuantile, MergeEmpty) { constexpr size_t kRows = 1000, kCols = 100; size_t 
n_bins = 10; HostDeviceVector ft; - SketchContainer sketch_0(ft, n_bins, kCols, kRows, 0); + SketchContainer sketch_0(ft, n_bins, kCols, kRows, FstCU()); HostDeviceVector storage_0; std::string interface_str_0 = - RandomDataGenerator{kRows, kCols, 0}.Device(0).GenerateArrayInterface( + RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).GenerateArrayInterface( &storage_0); data::CupyAdapter adapter_0(interface_str_0); MetaInfo info; @@ -204,34 +199,33 @@ TEST(GPUQuantile, MergeBasic) { constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) { HostDeviceVector ft; - SketchContainer sketch_0(ft, n_bins, kCols, kRows, 0); + SketchContainer sketch_0(ft, n_bins, kCols, kRows, FstCU()); HostDeviceVector storage_0; std::string interface_str_0 = RandomDataGenerator{kRows, kCols, 0} - .Device(0) + .Device(FstCU()) .Seed(seed) .GenerateArrayInterface(&storage_0); data::CupyAdapter adapter_0(interface_str_0); AdapterDeviceSketch(adapter_0.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &sketch_0); - SketchContainer sketch_1(ft, n_bins, kCols, kRows * kRows, 0); + SketchContainer sketch_1(ft, n_bins, kCols, kRows * kRows, FstCU()); HostDeviceVector storage_1; - std::string interface_str_1 = RandomDataGenerator{kRows, kCols, 0} - .Device(0) - .Seed(seed) - .GenerateArrayInterface(&storage_1); + std::string interface_str_1 = + RandomDataGenerator{kRows, kCols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface( + &storage_1); data::CupyAdapter adapter_1(interface_str_1); - AdapterDeviceSketch(adapter_1.Value(), n_bins, info, - std::numeric_limits::quiet_NaN(), &sketch_1); + AdapterDeviceSketch(adapter_1.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), + &sketch_1); size_t size_before_merge = sketch_0.Data().size(); sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data()); if (info.weights_.Size() != 0) { - TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr(), true); + TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr(), true); sketch_0.FixError(); - TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr(), false); + TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr(), false); } else { - TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr()); + TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr()); } auto columns_ptr = sketch_0.ColumnsPtr(); @@ -251,24 +245,22 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) { MetaInfo info; int32_t seed = 0; HostDeviceVector ft; - SketchContainer sketch_0(ft, n_bins, cols, rows, 0); + SketchContainer sketch_0(ft, n_bins, cols, rows, FstCU()); HostDeviceVector storage_0; - std::string interface_str_0 = RandomDataGenerator{rows, cols, 0} - .Device(0) - .Seed(seed) - .GenerateArrayInterface(&storage_0); + std::string interface_str_0 = + RandomDataGenerator{rows, cols, 0}.Device(FstCU()).Seed(seed).GenerateArrayInterface( + &storage_0); data::CupyAdapter adapter_0(interface_str_0); AdapterDeviceSketch(adapter_0.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &sketch_0); size_t f_rows = rows * frac; - SketchContainer sketch_1(ft, n_bins, cols, f_rows, 0); + SketchContainer sketch_1(ft, n_bins, cols, f_rows, FstCU()); HostDeviceVector storage_1; - std::string interface_str_1 = RandomDataGenerator{f_rows, cols, 0} - .Device(0) - .Seed(seed) - .GenerateArrayInterface(&storage_1); + std::string interface_str_1 = + RandomDataGenerator{f_rows, cols, 
0}.Device(FstCU()).Seed(seed).GenerateArrayInterface( + &storage_1); auto data_1 = storage_1.DeviceSpan(); auto tuple_it = thrust::make_tuple( thrust::make_counting_iterator(0ul), data_1.data()); @@ -290,7 +282,7 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) { size_t size_before_merge = sketch_0.Data().size(); sketch_0.Merge(sketch_1.ColumnsPtr(), sketch_1.Data()); - TestQuantileElemRank(0, sketch_0.Data(), sketch_0.ColumnsPtr()); + TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr()); auto columns_ptr = sketch_0.ColumnsPtr(); std::vector h_columns_ptr(columns_ptr.size()); @@ -321,11 +313,10 @@ TEST(GPUQuantile, MergeDuplicated) { TEST(GPUQuantile, MultiMerge) { constexpr size_t kRows = 20, kCols = 1; int32_t world = 2; - RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, - MetaInfo const &info) { + RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) { // Set up single node version HostDeviceVector ft; - SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, 0); + SketchContainer sketch_on_single_node(ft, n_bins, kCols, kRows, FstCU()); size_t intermediate_num_cuts = std::min( kRows * world, static_cast(n_bins * WQSketch::kFactor)); @@ -333,12 +324,12 @@ TEST(GPUQuantile, MultiMerge) { for (auto rank = 0; rank < world; ++rank) { HostDeviceVector storage; std::string interface_str = RandomDataGenerator{kRows, kCols, 0} - .Device(0) + .Device(FstCU()) .Seed(rank + seed) .GenerateArrayInterface(&storage); data::CupyAdapter adapter(interface_str); HostDeviceVector ft; - containers.emplace_back(ft, n_bins, kCols, kRows, 0); + containers.emplace_back(ft, n_bins, kCols, kRows, FstCU()); AdapterDeviceSketch(adapter.Value(), n_bins, info, std::numeric_limits::quiet_NaN(), &containers.back()); @@ -348,21 +339,44 @@ TEST(GPUQuantile, MultiMerge) { sketch_on_single_node.Merge(sketch.ColumnsPtr(), sketch.Data()); sketch_on_single_node.FixError(); } - TestQuantileElemRank(0, sketch_on_single_node.Data(), - sketch_on_single_node.ColumnsPtr()); + TestQuantileElemRank(FstCU(), sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr()); sketch_on_single_node.Unique(); - TestQuantileElemRank(0, sketch_on_single_node.Data(), - sketch_on_single_node.ColumnsPtr()); + TestQuantileElemRank(FstCU(), sketch_on_single_node.Data(), sketch_on_single_node.ColumnsPtr()); }); } +TEST(GPUQuantile, MissingColumns) { + auto dmat = std::unique_ptr{[=]() { + std::size_t constexpr kRows = 1000, kCols = 100; + auto sparsity = 0.5f; + std::vector ft(kCols); + for (size_t i = 0; i < ft.size(); ++i) { + ft[i] = (i % 2 == 0) ? 
FeatureType::kNumerical : FeatureType::kCategorical; + } + auto dmat = RandomDataGenerator{kRows, kCols, sparsity} + .Seed(0) + .Lower(.0f) + .Upper(1.0f) + .Type(ft) + .MaxCategory(13) + .GenerateDMatrix(); + return dmat->SliceCol(2, 1); + }()}; + dmat->Info().data_split_mode = DataSplitMode::kRow; + + auto ctx = MakeCUDACtx(0); + std::size_t constexpr kBins = 64; + HistogramCuts cuts = common::DeviceSketch(&ctx, dmat.get(), kBins); + ASSERT_TRUE(cuts.HasCategorical()); +} + namespace { void TestAllReduceBasic() { auto const world = collective::GetWorldSize(); constexpr size_t kRows = 1000, kCols = 100; RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) { - auto const device = GPUIDX; + auto const device = DeviceOrd::CUDA(GPUIDX); // Set up single node version; HostDeviceVector ft({}, device); @@ -440,18 +454,14 @@ TEST_F(MGPUQuantileTest, AllReduceBasic) { } namespace { -void TestColumnSplitBasic() { +void TestColumnSplit(DMatrix* dmat) { auto const world = collective::GetWorldSize(); auto const rank = collective::GetRank(); - std::size_t constexpr kRows = 1000, kCols = 100, kBins = 64; - - auto m = std::unique_ptr{[=]() { - auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); - return dmat->SliceCol(world, rank); - }()}; + auto m = std::unique_ptr{dmat->SliceCol(world, rank)}; // Generate cuts for distributed environment. auto ctx = MakeCUDACtx(GPUIDX); + std::size_t constexpr kBins = 64; HistogramCuts distributed_cuts = common::DeviceSketch(&ctx, m.get(), kBins); // Generate cuts for single node environment @@ -484,7 +494,26 @@ void TestColumnSplitBasic() { } // anonymous namespace TEST_F(MGPUQuantileTest, ColumnSplitBasic) { - DoTest(TestColumnSplitBasic); + std::size_t constexpr kRows = 1000, kCols = 100; + auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); + DoTest(TestColumnSplit, dmat.get()); +} + +TEST_F(MGPUQuantileTest, ColumnSplitCategorical) { + std::size_t constexpr kRows = 1000, kCols = 100; + auto sparsity = 0.5f; + std::vector ft(kCols); + for (size_t i = 0; i < ft.size(); ++i) { + ft[i] = (i % 2 == 0) ? 
FeatureType::kNumerical : FeatureType::kCategorical; + } + auto dmat = RandomDataGenerator{kRows, kCols, sparsity} + .Seed(0) + .Lower(.0f) + .Upper(1.0f) + .Type(ft) + .MaxCategory(13) + .GenerateDMatrix(); + DoTest(TestColumnSplit, dmat.get()); } namespace { @@ -494,7 +523,7 @@ void TestSameOnAllWorkers() { RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) { auto const rank = collective::GetRank(); - auto const device = GPUIDX; + auto const device = DeviceOrd::CUDA(GPUIDX); HostDeviceVector ft({}, device); SketchContainer sketch_distributed(ft, n_bins, kCols, kRows, device); HostDeviceVector storage({}, device); @@ -525,9 +554,9 @@ void TestSameOnAllWorkers() { thrust::copy(thrust::device, local_data.data(), local_data.data() + local_data.size(), all_workers.begin() + local_data.size() * rank); - collective::AllReduce(device, all_workers.data().get(), + collective::AllReduce(device.ordinal, all_workers.data().get(), all_workers.size()); - collective::Synchronize(device); + collective::Synchronize(device.ordinal); auto base_line = dh::ToSpan(all_workers).subspan(0, size_as_float); std::vector h_base_line(base_line.size()); @@ -573,7 +602,7 @@ TEST(GPUQuantile, Push) { columns_ptr[1] = kRows; HostDeviceVector ft; - SketchContainer sketch(ft, n_bins, kCols, kRows, 0); + SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU()); sketch.Push(dh::ToSpan(d_entries), dh::ToSpan(columns_ptr), dh::ToSpan(columns_ptr), kRows, {}); auto sketch_data = sketch.Data(); @@ -613,7 +642,7 @@ TEST(GPUQuantile, MultiColPush) { int32_t n_bins = 16; HostDeviceVector ft; - SketchContainer sketch(ft, n_bins, kCols, kRows, 0); + SketchContainer sketch(ft, n_bins, kCols, kRows, FstCU()); dh::device_vector d_entries {entries}; dh::device_vector columns_ptr(kCols + 1, 0); diff --git a/tests/cpp/common/test_ranking_utils.cc b/tests/cpp/common/test_ranking_utils.cc index 919102278..b57ee90cd 100644 --- a/tests/cpp/common/test_ranking_utils.cc +++ b/tests/cpp/common/test_ranking_utils.cc @@ -95,7 +95,7 @@ void TestRankingCache(Context const* ctx) { HostDeviceVector predt(info.num_row_, 0); auto& h_predt = predt.HostVector(); std::iota(h_predt.begin(), h_predt.end(), 0.0f); - predt.SetDevice(ctx->gpu_id); + predt.SetDevice(ctx->Device()); auto rank_idx = cache.SortedIdx(ctx, ctx->IsCPU() ? 
predt.ConstHostSpan() : predt.ConstDeviceSpan()); @@ -129,7 +129,7 @@ void TestNDCGCache(Context const* ctx) { auto fail = [&]() { NDCGCache cache{ctx, info, param}; }; // empty label ASSERT_THROW(fail(), dmlc::Error); - info.labels = linalg::Matrix{{0.0f, 0.1f, 0.2f}, {3}, Context::kCpuId}; + info.labels = linalg::Matrix{{0.0f, 0.1f, 0.2f}, {3}, DeviceOrd::CPU()}; // invalid label ASSERT_THROW(fail(), dmlc::Error); auto h_labels = info.labels.HostView(); diff --git a/tests/cpp/common/test_ranking_utils.cu b/tests/cpp/common/test_ranking_utils.cu index f42c903c4..f3a59e55b 100644 --- a/tests/cpp/common/test_ranking_utils.cu +++ b/tests/cpp/common/test_ranking_utils.cu @@ -42,7 +42,7 @@ void TestCalcQueriesInvIDCG() { auto d_scores = dh::ToSpan(scores); common::SegmentedSequence(&ctx, d_group_ptr, d_scores); - linalg::Vector inv_IDCG({n_groups}, ctx.gpu_id); + linalg::Vector inv_IDCG({n_groups}, ctx.Device()); ltr::LambdaRankParam p; p.UpdateAllowUnknown(Args{{"ndcg_exp_gain", "false"}}); @@ -77,7 +77,7 @@ void TestRankingCache(Context const* ctx) { HostDeviceVector predt(info.num_row_, 0); auto& h_predt = predt.HostVector(); std::iota(h_predt.begin(), h_predt.end(), 0.0f); - predt.SetDevice(ctx->gpu_id); + predt.SetDevice(ctx->Device()); auto rank_idx = cache.SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan()); diff --git a/tests/cpp/common/test_stats.cc b/tests/cpp/common/test_stats.cc index fb982af12..070c9d6f1 100644 --- a/tests/cpp/common/test_stats.cc +++ b/tests/cpp/common/test_stats.cc @@ -9,12 +9,11 @@ #include "../../../src/common/transform_iterator.h" // common::MakeIndexTransformIter #include "../helpers.h" -namespace xgboost { -namespace common { +namespace xgboost::common { TEST(Stats, Quantile) { Context ctx; { - linalg::Tensor arr({20.f, 0.f, 15.f, 50.f, 40.f, 0.f, 35.f}, {7}, Context::kCpuId); + linalg::Tensor arr({20.f, 0.f, 15.f, 50.f, 40.f, 0.f, 35.f}, {7}, DeviceOrd::CPU()); std::vector index{0, 2, 3, 4, 6}; auto h_arr = arr.HostView(); auto beg = MakeIndexTransformIter([&](size_t i) { return h_arr(index[i]); }); @@ -40,8 +39,8 @@ TEST(Stats, Quantile) { TEST(Stats, WeightedQuantile) { Context ctx; - linalg::Tensor arr({1.f, 2.f, 3.f, 4.f, 5.f}, {5}, Context::kCpuId); - linalg::Tensor weight({1.f, 1.f, 1.f, 1.f, 1.f}, {5}, Context::kCpuId); + linalg::Tensor arr({1.f, 2.f, 3.f, 4.f, 5.f}, {5}, DeviceOrd::CPU()); + linalg::Tensor weight({1.f, 1.f, 1.f, 1.f, 1.f}, {5}, DeviceOrd::CPU()); auto h_arr = arr.HostView(); auto h_weight = weight.HostView(); @@ -64,7 +63,7 @@ TEST(Stats, Median) { Context ctx; { - linalg::Tensor values{{.0f, .0f, 1.f, 2.f}, {4}, Context::kCpuId}; + linalg::Tensor values{{.0f, .0f, 1.f, 2.f}, {4}, DeviceOrd::CPU()}; HostDeviceVector weights; linalg::Tensor out; Median(&ctx, values, weights, &out); @@ -83,7 +82,7 @@ TEST(Stats, Median) { { ctx = ctx.MakeCPU(); // 4x2 matrix - linalg::Tensor values{{0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 2.f, 2.f}, {4, 2}, ctx.gpu_id}; + linalg::Tensor values{{0.f, 0.f, 0.f, 0.f, 1.f, 1.f, 2.f, 2.f}, {4, 2}, ctx.Device()}; HostDeviceVector weights; linalg::Tensor out; Median(&ctx, values, weights, &out); @@ -102,14 +101,14 @@ TEST(Stats, Median) { namespace { void TestMean(Context const* ctx) { std::size_t n{128}; - linalg::Vector data({n}, ctx->gpu_id); + linalg::Vector data({n}, ctx->Device()); auto h_v = data.HostView().Values(); std::iota(h_v.begin(), h_v.end(), .0f); auto nf = static_cast(n); float mean = nf * (nf - 1) / 2 / n; - linalg::Vector res{{1}, ctx->gpu_id}; + linalg::Vector res{{1}, 
ctx->Device()}; Mean(ctx, data, &res); auto h_res = res.HostView(); ASSERT_EQ(h_res.Size(), 1); @@ -127,6 +126,5 @@ TEST(Stats, GPUMean) { auto ctx = MakeCUDACtx(0); TestMean(&ctx); } -#endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP) -} // namespace common -} // namespace xgboost +#endif // defined(XGBOOST_USE_CUDA) +} // namespace xgboost::common diff --git a/tests/cpp/common/test_stats.cu b/tests/cpp/common/test_stats.cu index 4911643f7..e07383fff 100644 --- a/tests/cpp/common/test_stats.cu +++ b/tests/cpp/common/test_stats.cu @@ -25,8 +25,8 @@ namespace common { namespace { class StatsGPU : public ::testing::Test { private: - linalg::Tensor arr_{{1.f, 2.f, 3.f, 4.f, 5.f, 2.f, 4.f, 5.f, 3.f, 1.f}, {10}, 0}; - linalg::Tensor indptr_{{0, 5, 10}, {3}, 0}; + linalg::Tensor arr_{{1.f, 2.f, 3.f, 4.f, 5.f, 2.f, 4.f, 5.f, 3.f, 1.f}, {10}, FstCU()}; + linalg::Tensor indptr_{{0, 5, 10}, {3}, FstCU()}; HostDeviceVector results_; using TestSet = std::vector>; Context ctx_; @@ -51,7 +51,7 @@ class StatsGPU : public ::testing::Test { data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end()); - linalg::Tensor arr{data.cbegin(), data.cend(), {data.size()}, 0}; + linalg::Tensor arr{data.cbegin(), data.cend(), {data.size()}, FstCU()}; auto d_arr = arr.View(DeviceOrd::CUDA(0)); auto key_it = dh::MakeTransformIterator( @@ -63,7 +63,7 @@ class StatsGPU : public ::testing::Test { // one alpha for each segment HostDeviceVector alphas{0.0f, 0.5f, 1.0f}; - alphas.SetDevice(0); + alphas.SetDevice(FstCU()); auto d_alphas = alphas.ConstDeviceSpan(); auto w_it = thrust::make_constant_iterator(0.1f); SegmentedWeightedQuantile(&ctx_, d_alphas.data(), key_it, key_it + d_alphas.size() + 1, val_it, @@ -85,7 +85,7 @@ class StatsGPU : public ::testing::Test { auto val_it = dh::MakeTransformIterator(thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(std::size_t i) { return d_arr(i); }); - linalg::Tensor weights{{10}, 0}; + linalg::Tensor weights{{10}, FstCU()}; linalg::ElementWiseTransformDevice(weights.View(DeviceOrd::CUDA(0)), [=] XGBOOST_DEVICE(std::size_t, float) { return 1.0; }); auto w_it = weights.Data()->ConstDevicePointer(); @@ -106,7 +106,7 @@ class StatsGPU : public ::testing::Test { data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end()); data.insert(data.cend(), seg.begin(), seg.end()); - linalg::Tensor arr{data.cbegin(), data.cend(), {data.size()}, 0}; + linalg::Tensor arr{data.cbegin(), data.cend(), {data.size()}, FstCU()}; auto d_arr = arr.View(DeviceOrd::CUDA(0)); auto key_it = dh::MakeTransformIterator( @@ -118,7 +118,7 @@ class StatsGPU : public ::testing::Test { // one alpha for each segment HostDeviceVector alphas{0.1f, 0.2f, 0.4f}; - alphas.SetDevice(0); + alphas.SetDevice(FstCU()); auto d_alphas = alphas.ConstDeviceSpan(); SegmentedQuantile(&ctx_, d_alphas.data(), key_it, key_it + d_alphas.size() + 1, val_it, val_it + d_arr.Size(), &results_); diff --git a/tests/cpp/common/test_transform_range.cc b/tests/cpp/common/test_transform_range.cc index 396d9f307..af130830b 100644 --- a/tests/cpp/common/test_transform_range.cc +++ b/tests/cpp/common/test_transform_range.cc @@ -11,63 +11,59 @@ #include "../../../src/common/transform.h" #include "../helpers.h" +namespace xgboost::common { +namespace { +constexpr DeviceOrd TransformDevice() { #if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__) - -#define TRANSFORM_GPU 0 - + return DeviceOrd::CUDA(0); #else 
- -#define TRANSFORM_GPU -1 - + return DeviceOrd::CPU(); #endif - -namespace xgboost { -namespace common { +} +} // namespace template struct TestTransformRange { - void XGBOOST_DEVICE operator()(size_t _idx, - Span _out, Span _in) { + void XGBOOST_DEVICE operator()(std::size_t _idx, Span _out, Span _in) { _out[_idx] = _in[_idx]; } }; TEST(Transform, DeclareUnifiedTest(Basic)) { - const size_t size {256}; - std::vector h_in(size); - std::vector h_out(size); + const size_t size{256}; + std::vector h_in(size); + std::vector h_out(size); std::iota(h_in.begin(), h_in.end(), 0); - std::vector h_sol(size); + std::vector h_sol(size); std::iota(h_sol.begin(), h_sol.end(), 0); - const HostDeviceVector in_vec{h_in, TRANSFORM_GPU}; - HostDeviceVector out_vec{h_out, TRANSFORM_GPU}; + auto device = TransformDevice(); + HostDeviceVector const in_vec{h_in, device}; + HostDeviceVector out_vec{h_out, device}; out_vec.Fill(0); - Transform<>::Init(TestTransformRange{}, + Transform<>::Init(TestTransformRange{}, Range{0, static_cast(size)}, AllThreadsForTest(), - TRANSFORM_GPU) + TransformDevice()) .Eval(&out_vec, &in_vec); - std::vector res = out_vec.HostVector(); + std::vector res = out_vec.HostVector(); ASSERT_TRUE(std::equal(h_sol.begin(), h_sol.end(), res.begin())); } #if !defined(__CUDACC__) && !defined(__HIP_PLATFORM_AMD__) TEST(TransformDeathTest, Exception) { - size_t const kSize {16}; - std::vector h_in(kSize); - const HostDeviceVector in_vec{h_in, -1}; + size_t const kSize{16}; + std::vector h_in(kSize); + const HostDeviceVector in_vec{h_in, DeviceOrd::CPU()}; EXPECT_DEATH( { Transform<>::Init([](size_t idx, common::Span _in) { _in[idx + 1]; }, Range(0, static_cast(kSize)), AllThreadsForTest(), - -1) + DeviceOrd::CPU()) .Eval(&in_vec); }, ""); } #endif - -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/tests/cpp/common/test_transform_range.cu b/tests/cpp/common/test_transform_range.cu new file mode 100644 index 000000000..b0fa7c102 --- /dev/null +++ b/tests/cpp/common/test_transform_range.cu @@ -0,0 +1,5 @@ +/** + * Copyright 2023 XGBoost contributors + */ +// Dummy file to keep the CUDA tests. 
+#include "test_transform_range.cc" diff --git a/tests/cpp/data/test_device_adapter.cu b/tests/cpp/data/test_device_adapter.cu index 047fbdd03..19e220c48 100644 --- a/tests/cpp/data/test_device_adapter.cu +++ b/tests/cpp/data/test_device_adapter.cu @@ -70,12 +70,12 @@ TEST(DeviceAdapter, GetRowCounts) { for (bst_feature_t n_features : {1, 2, 4, 64, 128, 256}) { HostDeviceVector storage; auto str_arr = RandomDataGenerator{8192, n_features, 0.0} - .Device(ctx.gpu_id) + .Device(ctx.Device()) .GenerateArrayInterface(&storage); auto adapter = CupyAdapter{str_arr}; HostDeviceVector offset(adapter.NumRows() + 1, 0); - offset.SetDevice(ctx.gpu_id); - auto rstride = GetRowCounts(adapter.Value(), offset.DeviceSpan(), ctx.gpu_id, + offset.SetDevice(ctx.Device()); + auto rstride = GetRowCounts(adapter.Value(), offset.DeviceSpan(), ctx.Device(), std::numeric_limits::quiet_NaN()); ASSERT_EQ(rstride, n_features); } diff --git a/tests/cpp/data/test_ellpack_page.cu b/tests/cpp/data/test_ellpack_page.cu index ac6cbd539..dd3a30f7d 100644 --- a/tests/cpp/data/test_ellpack_page.cu +++ b/tests/cpp/data/test_ellpack_page.cu @@ -98,7 +98,7 @@ TEST(EllpackPage, FromCategoricalBasic) { Context ctx{MakeCUDACtx(0)}; auto p = BatchParam{max_bins, tree::TrainParam::DftSparseThreshold()}; auto ellpack = EllpackPage(&ctx, m.get(), p); - auto accessor = ellpack.Impl()->GetDeviceAccessor(0); + auto accessor = ellpack.Impl()->GetDeviceAccessor(FstCU()); ASSERT_EQ(kCats, accessor.NumBins()); auto x_copy = x; @@ -156,13 +156,12 @@ TEST(EllpackPage, Copy) { auto page = (*dmat->GetBatches(&ctx, param).begin()).Impl(); // Create an empty result page. - EllpackPageImpl result(0, page->Cuts(), page->is_dense, page->row_stride, - kRows); + EllpackPageImpl result(FstCU(), page->Cuts(), page->is_dense, page->row_stride, kRows); // Copy batch pages into the result page. size_t offset = 0; for (auto& batch : dmat->GetBatches(&ctx, param)) { - size_t num_elements = result.Copy(0, batch.Impl(), offset); + size_t num_elements = result.Copy(FstCU(), batch.Impl(), offset); offset += num_elements; } @@ -176,10 +175,12 @@ TEST(EllpackPage, Copy) { EXPECT_EQ(impl->base_rowid, current_row); for (size_t i = 0; i < impl->Size(); i++) { - dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get())); + dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(FstCU()), current_row, + row_d.data().get())); thrust::copy(row_d.begin(), row_d.end(), row.begin()); - dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(0), current_row, row_result_d.data().get())); + dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(FstCU()), current_row, + row_result_d.data().get())); thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin()); EXPECT_EQ(row, row_result); @@ -203,8 +204,7 @@ TEST(EllpackPage, Compact) { auto page = (*dmat->GetBatches(&ctx, param).begin()).Impl(); // Create an empty result page. - EllpackPageImpl result(0, page->Cuts(), page->is_dense, page->row_stride, - kCompactedRows); + EllpackPageImpl result(FstCU(), page->Cuts(), page->is_dense, page->row_stride, kCompactedRows); // Compact batch pages into the result page. 
std::vector row_indexes_h { @@ -213,7 +213,7 @@ TEST(EllpackPage, Compact) { thrust::device_vector row_indexes_d = row_indexes_h; common::Span row_indexes_span(row_indexes_d.data().get(), kRows); for (auto& batch : dmat->GetBatches(&ctx, param)) { - result.Compact(0, batch.Impl(), row_indexes_span); + result.Compact(FstCU(), batch.Impl(), row_indexes_span); } size_t current_row = 0; @@ -232,7 +232,7 @@ TEST(EllpackPage, Compact) { continue; } - dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(0), + dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(FstCU()), current_row, row_d.data().get())); #if defined(XGBOOST_USE_CUDA) dh::safe_cuda(cudaDeviceSynchronize()); @@ -242,7 +242,7 @@ TEST(EllpackPage, Compact) { thrust::copy(row_d.begin(), row_d.end(), row.begin()); dh::LaunchN(kCols, - ReadRowFunction(result.GetDeviceAccessor(0), compacted_row, + ReadRowFunction(result.GetDeviceAccessor(FstCU()), compacted_row, row_result_d.data().get())); thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin()); diff --git a/tests/cpp/data/test_gradient_index.cc b/tests/cpp/data/test_gradient_index.cc index 2e1ffcc01..8f8281991 100644 --- a/tests/cpp/data/test_gradient_index.cc +++ b/tests/cpp/data/test_gradient_index.cc @@ -30,7 +30,7 @@ namespace xgboost::data { TEST(GradientIndex, ExternalMemoryBaseRowID) { Context ctx; auto p_fmat = RandomDataGenerator{4096, 256, 0.5} - .Device(ctx.gpu_id) + .Device(ctx.Device()) .Batches(8) .GenerateSparsePageDMatrix("cache", true); diff --git a/tests/cpp/data/test_iterative_dmatrix.cu b/tests/cpp/data/test_iterative_dmatrix.cu index da71f9677..81539c22d 100644 --- a/tests/cpp/data/test_iterative_dmatrix.cu +++ b/tests/cpp/data/test_iterative_dmatrix.cu @@ -16,9 +16,7 @@ #include "../helpers.h" #include "test_iterative_dmatrix.h" -namespace xgboost { -namespace data { - +namespace xgboost::data { void TestEquivalent(float sparsity) { Context ctx{MakeCUDACtx(0)}; @@ -28,14 +26,14 @@ void TestEquivalent(float sparsity) { std::size_t offset = 0; auto first = (*m.GetEllpackBatches(&ctx, {}).begin()).Impl(); std::unique_ptr page_concatenated { - new EllpackPageImpl(0, first->Cuts(), first->is_dense, + new EllpackPageImpl(ctx.Device(), first->Cuts(), first->is_dense, first->row_stride, 1000 * 100)}; for (auto& batch : m.GetBatches(&ctx, {})) { auto page = batch.Impl(); - size_t num_elements = page_concatenated->Copy(0, page, offset); + size_t num_elements = page_concatenated->Copy(ctx.Device(), page, offset); offset += num_elements; } - auto from_iter = page_concatenated->GetDeviceAccessor(0); + auto from_iter = page_concatenated->GetDeviceAccessor(ctx.Device()); ASSERT_EQ(m.Info().num_col_, CudaArrayIterForTest::Cols()); ASSERT_EQ(m.Info().num_row_, CudaArrayIterForTest::Rows()); @@ -45,7 +43,7 @@ void TestEquivalent(float sparsity) { DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), 0)}; auto bp = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; for (auto& ellpack : dm->GetBatches(&ctx, bp)) { - auto from_data = ellpack.Impl()->GetDeviceAccessor(0); + auto from_data = ellpack.Impl()->GetDeviceAccessor(ctx.Device()); std::vector cuts_from_iter(from_iter.gidx_fvalue_map.size()); std::vector min_fvalues_iter(from_iter.min_fvalue.size()); @@ -157,10 +155,10 @@ TEST(IterativeDeviceDMatrix, RowMajorMissing) { auto impl = ellpack.Impl(); common::CompressedIterator iterator( impl->gidx_buffer.HostVector().data(), impl->NumSymbols()); - EXPECT_EQ(iterator[1], impl->GetDeviceAccessor(0).NullValue()); - EXPECT_EQ(iterator[5], 
impl->GetDeviceAccessor(0).NullValue()); + EXPECT_EQ(iterator[1], impl->GetDeviceAccessor(ctx.Device()).NullValue()); + EXPECT_EQ(iterator[5], impl->GetDeviceAccessor(ctx.Device()).NullValue()); // null values get placed after valid values in a row - EXPECT_EQ(iterator[7], impl->GetDeviceAccessor(0).NullValue()); + EXPECT_EQ(iterator[7], impl->GetDeviceAccessor(ctx.Device()).NullValue()); EXPECT_EQ(m.Info().num_col_, cols); EXPECT_EQ(m.Info().num_row_, rows); EXPECT_EQ(m.Info().num_nonzero_, rows* cols - 3); @@ -188,5 +186,4 @@ TEST(IterativeDeviceDMatrix, Ref) { TestRefDMatrix( &ctx, [](EllpackPage const& page) { return page.Impl()->Cuts(); }); } -} // namespace data -} // namespace xgboost +} // namespace xgboost::data diff --git a/tests/cpp/data/test_metainfo.cc b/tests/cpp/data/test_metainfo.cc index e8d6fbb45..b3f3a67ca 100644 --- a/tests/cpp/data/test_metainfo.cc +++ b/tests/cpp/data/test_metainfo.cc @@ -12,6 +12,7 @@ #include "../helpers.h" #include "xgboost/base.h" +namespace xgboost { TEST(MetaInfo, GetSet) { xgboost::Context ctx; xgboost::MetaInfo info; @@ -73,6 +74,49 @@ TEST(MetaInfo, GetSetFeature) { // Other conditions are tested in `SaveLoadBinary`. } +namespace { +void VerifyGetSetFeatureColumnSplit() { + xgboost::MetaInfo info; + info.data_split_mode = DataSplitMode::kCol; + auto const world_size = collective::GetWorldSize(); + + auto constexpr kCols{2}; + std::vector types{u8"float", u8"c"}; + std::vector c_types(kCols); + std::transform(types.cbegin(), types.cend(), c_types.begin(), + [](auto const &str) { return str.c_str(); }); + info.num_col_ = kCols; + EXPECT_THROW(info.SetFeatureInfo(u8"feature_type", c_types.data(), c_types.size()), dmlc::Error); + info.num_col_ = kCols * world_size; + EXPECT_NO_THROW(info.SetFeatureInfo(u8"feature_type", c_types.data(), c_types.size())); + std::vector expected_type_names{u8"float", u8"c", u8"float", + u8"c", u8"float", u8"c"}; + EXPECT_EQ(info.feature_type_names, expected_type_names); + std::vector expected_types{ + xgboost::FeatureType::kNumerical, xgboost::FeatureType::kCategorical, + xgboost::FeatureType::kNumerical, xgboost::FeatureType::kCategorical, + xgboost::FeatureType::kNumerical, xgboost::FeatureType::kCategorical}; + EXPECT_EQ(info.feature_types.HostVector(), expected_types); + + std::vector names{u8"feature0", u8"feature1"}; + std::vector c_names(kCols); + std::transform(names.cbegin(), names.cend(), c_names.begin(), + [](auto const &str) { return str.c_str(); }); + info.num_col_ = kCols; + EXPECT_THROW(info.SetFeatureInfo(u8"feature_name", c_names.data(), c_names.size()), dmlc::Error); + info.num_col_ = kCols * world_size; + EXPECT_NO_THROW(info.SetFeatureInfo(u8"feature_name", c_names.data(), c_names.size())); + std::vector expected_names{u8"0.feature0", u8"0.feature1", u8"1.feature0", + u8"1.feature1", u8"2.feature0", u8"2.feature1"}; + EXPECT_EQ(info.feature_names, expected_names); +} +} // anonymous namespace + +TEST(MetaInfo, GetSetFeatureColumnSplit) { + auto constexpr kWorldSize{3}; + RunWithInMemoryCommunicator(kWorldSize, VerifyGetSetFeatureColumnSplit); +} + TEST(MetaInfo, SaveLoadBinary) { xgboost::MetaInfo info; xgboost::Context ctx; @@ -236,9 +280,9 @@ TEST(MetaInfo, Validate) { info.num_nonzero_ = 12; info.num_col_ = 3; std::vector groups (11); - xgboost::Context ctx; + Context ctx; info.SetInfo(ctx, "group", groups.data(), xgboost::DataType::kUInt32, 11); - EXPECT_THROW(info.Validate(0), dmlc::Error); + EXPECT_THROW(info.Validate(FstCU()), dmlc::Error); std::vector labels(info.num_row_ + 1); 
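VerifyGetSetFeatureColumnSplit above pins down SetFeatureInfo under column-wise splits: each worker supplies its local schema, type vectors are gathered in rank order, and feature names are prefixed with the contributing rank. A standalone sketch of that naming rule (PrefixByRank is illustrative, not an xgboost API):

#include <cstddef>
#include <string>
#include <vector>

// Illustrative: reproduce the expected_names the test asserts for a
// 2-column local schema gathered across 3 workers.
std::vector<std::string> PrefixByRank(std::vector<std::string> const& local_names,
                                      int world_size) {
  std::vector<std::string> out;
  out.reserve(local_names.size() * static_cast<std::size_t>(world_size));
  for (int rank = 0; rank < world_size; ++rank) {
    for (auto const& name : local_names) {
      out.push_back(std::to_string(rank) + "." + name);  // e.g. "1.feature0"
    }
  }
  return out;
}

// PrefixByRank({"feature0", "feature1"}, 3) yields
// {"0.feature0", "0.feature1", "1.feature0", "1.feature1", "2.feature0", "2.feature1"}.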
EXPECT_THROW( @@ -261,11 +305,11 @@ TEST(MetaInfo, Validate) { info.group_ptr_.clear(); labels.resize(info.num_row_); info.SetInfo(ctx, "label", labels.data(), xgboost::DataType::kFloat32, info.num_row_); - info.labels.SetDevice(0); - EXPECT_THROW(info.Validate(1), dmlc::Error); + info.labels.SetDevice(FstCU()); + EXPECT_THROW(info.Validate(DeviceOrd::CUDA(1)), dmlc::Error); xgboost::HostDeviceVector d_groups{groups}; - d_groups.SetDevice(0); + d_groups.SetDevice(FstCU()); d_groups.DevicePointer(); // pull to device std::string arr_interface_str{ArrayInterfaceStr(xgboost::linalg::MakeVec( d_groups.ConstDevicePointer(), d_groups.Size(), xgboost::DeviceOrd::CUDA(0)))}; @@ -306,6 +350,5 @@ TEST(MetaInfo, HostExtend) { } } -namespace xgboost { TEST(MetaInfo, CPUStridedData) { TestMetaInfoStridedData(DeviceOrd::CPU()); } } // namespace xgboost diff --git a/tests/cpp/data/test_proxy_dmatrix.cc b/tests/cpp/data/test_proxy_dmatrix.cc index a6d0b2188..996836ed6 100644 --- a/tests/cpp/data/test_proxy_dmatrix.cc +++ b/tests/cpp/data/test_proxy_dmatrix.cc @@ -1,31 +1,27 @@ -/*! - * Copyright 2021 XGBoost contributors +/** + * Copyright 2021-2023, XGBoost contributors */ #include -#include "../helpers.h" -#include "../../../src/data/proxy_dmatrix.h" -#include "../../../src/data/adapter.h" -namespace xgboost { -namespace data { +#include "../../../src/data/adapter.h" +#include "../../../src/data/proxy_dmatrix.h" +#include "../helpers.h" + +namespace xgboost::data { TEST(ProxyDMatrix, HostData) { DMatrixProxy proxy; size_t constexpr kRows = 100, kCols = 10; std::vector> label_storage(1); HostDeviceVector storage; - auto data = RandomDataGenerator(kRows, kCols, 0.5) - .Device(0) - .GenerateArrayInterface(&storage); + auto data = + RandomDataGenerator(kRows, kCols, 0.5).Device(FstCU()).GenerateArrayInterface(&storage); proxy.SetArrayData(data.c_str()); - auto n_samples = HostAdapterDispatch( - &proxy, [](auto const &value) { return value.Size(); }); + auto n_samples = HostAdapterDispatch(&proxy, [](auto const &value) { return value.Size(); }); ASSERT_EQ(n_samples, kRows); - auto n_features = HostAdapterDispatch( - &proxy, [](auto const &value) { return value.NumCols(); }); + auto n_features = HostAdapterDispatch(&proxy, [](auto const &value) { return value.NumCols(); }); ASSERT_EQ(n_features, kCols); } -} // namespace data -} // namespace xgboost +} // namespace xgboost::data diff --git a/tests/cpp/data/test_proxy_dmatrix.cu b/tests/cpp/data/test_proxy_dmatrix.cu index cfbe731ec..d8ee84810 100644 --- a/tests/cpp/data/test_proxy_dmatrix.cu +++ b/tests/cpp/data/test_proxy_dmatrix.cu @@ -19,10 +19,12 @@ namespace xgboost::data { TEST(ProxyDMatrix, DeviceData) { constexpr size_t kRows{100}, kCols{100}; HostDeviceVector storage; - auto data = RandomDataGenerator(kRows, kCols, 0.5).Device(0).GenerateArrayInterface(&storage); + auto data = + RandomDataGenerator(kRows, kCols, 0.5).Device(FstCU()).GenerateArrayInterface(&storage); std::vector> label_storage(1); - auto labels = - RandomDataGenerator(kRows, 1, 0).Device(0).GenerateColumnarArrayInterface(&label_storage); + auto labels = RandomDataGenerator(kRows, 1, 0) + .Device(FstCU()) + .GenerateColumnarArrayInterface(&label_storage); DMatrixProxy proxy; proxy.SetCUDAArray(data.c_str()); @@ -35,7 +37,7 @@ TEST(ProxyDMatrix, DeviceData) { std::vector> columnar_storage(kCols); data = RandomDataGenerator(kRows, kCols, 0) - .Device(0) + .Device(FstCU()) .GenerateColumnarArrayInterface(&columnar_storage); proxy.SetCUDAArray(data.c_str()); 
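The Validate hunks above keep the device-consistency check typed: labels pushed to CUDA(0) must fail validation against CUDA(1). The logic being exercised, sketched generically (the function name and exception type are illustrative; xgboost itself raises dmlc::Error):

#include <sstream>
#include <stdexcept>

// Illustrative consistency check only -- not xgboost's implementation.
// DeviceLike is any type exposing IsCUDA() and ordinal (e.g. DeviceOrd);
// Vec is any container exposing Device() (e.g. HostDeviceVector).
template <typename DeviceLike, typename Vec>
void ValidateDeviceSketch(DeviceLike expected, Vec const& data) {
  auto dev = data.Device();
  if (dev.IsCUDA() && dev.ordinal != expected.ordinal) {
    std::ostringstream msg;
    msg << "Data is on CUDA:" << dev.ordinal << ", but device CUDA:"
        << expected.ordinal << " was requested";
    throw std::runtime_error(msg.str());
  }
}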
ASSERT_EQ(proxy.Adapter().type(), typeid(std::shared_ptr)); diff --git a/tests/cpp/data/test_simple_dmatrix.cc b/tests/cpp/data/test_simple_dmatrix.cc index f1d588196..fa4165796 100644 --- a/tests/cpp/data/test_simple_dmatrix.cc +++ b/tests/cpp/data/test_simple_dmatrix.cc @@ -268,7 +268,7 @@ TEST(SimpleDMatrix, Slice) { std::iota(upper.begin(), upper.end(), 1.0f); auto& margin = p_m->Info().base_margin_; - margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, Context::kCpuId}; + margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, DeviceOrd::CPU()}; std::array ridxs {1, 3, 5}; std::unique_ptr out { p_m->Slice(ridxs) }; @@ -341,7 +341,7 @@ TEST(SimpleDMatrix, SliceCol) { std::iota(upper.begin(), upper.end(), 1.0f); auto& margin = p_m->Info().base_margin_; - margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, Context::kCpuId}; + margin = decltype(p_m->Info().base_margin_){{kRows, kClasses}, DeviceOrd::CPU()}; auto constexpr kSlices {2}; auto constexpr kSliceSize {4}; @@ -428,3 +428,21 @@ TEST(SimpleDMatrix, Threads) { DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), 0, "")}; ASSERT_EQ(p_fmat->Ctx()->Threads(), AllThreadsForTest()); } + +namespace { +void VerifyColumnSplit() { + size_t constexpr kRows {16}; + size_t constexpr kCols {8}; + auto dmat = + RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(false, false, 1, DataSplitMode::kCol); + + ASSERT_EQ(dmat->Info().num_col_, kCols * collective::GetWorldSize()); + ASSERT_EQ(dmat->Info().num_row_, kRows); + ASSERT_EQ(dmat->Info().data_split_mode, DataSplitMode::kCol); +} +} // anonymous namespace + +TEST(SimpleDMatrix, ColumnSplit) { + auto constexpr kWorldSize{3}; + RunWithInMemoryCommunicator(kWorldSize, VerifyColumnSplit); +} diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu index 306cb79ca..9ec746ea3 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cu +++ b/tests/cpp/data/test_sparse_page_dmatrix.cu @@ -138,11 +138,11 @@ TEST(SparsePageDMatrix, EllpackPageContent) { size_t offset = 0; for (auto& batch : dmat_ext->GetBatches(&ctx, param)) { if (!impl_ext) { - impl_ext.reset(new EllpackPageImpl( - batch.Impl()->gidx_buffer.DeviceIdx(), batch.Impl()->Cuts(), - batch.Impl()->is_dense, batch.Impl()->row_stride, kRows)); + impl_ext = std::make_unique(batch.Impl()->gidx_buffer.Device(), + batch.Impl()->Cuts(), batch.Impl()->is_dense, + batch.Impl()->row_stride, kRows); } - auto n_elems = impl_ext->Copy(0, batch.Impl(), offset); + auto n_elems = impl_ext->Copy(ctx.Device(), batch.Impl(), offset); offset += n_elems; } EXPECT_EQ(impl_ext->base_rowid, 0); @@ -202,10 +202,12 @@ TEST(SparsePageDMatrix, MultipleEllpackPageContent) { EXPECT_EQ(impl_ext->base_rowid, current_row); for (size_t i = 0; i < impl_ext->Size(); i++) { - dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(0), current_row, row_d.data().get())); + dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row, + row_d.data().get())); thrust::copy(row_d.begin(), row_d.end(), row.begin()); - dh::LaunchN(kCols, ReadRowFunction(impl_ext->GetDeviceAccessor(0), current_row, row_ext_d.data().get())); + dh::LaunchN(kCols, ReadRowFunction(impl_ext->GetDeviceAccessor(ctx.Device()), current_row, + row_ext_d.data().get())); thrust::copy(row_ext_d.begin(), row_ext_d.end(), row_ext.begin()); EXPECT_EQ(row, row_ext); diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc index 3d53c0f49..f18f3133d 100644 --- a/tests/cpp/gbm/test_gbtree.cc +++ 
b/tests/cpp/gbm/test_gbtree.cc @@ -65,7 +65,7 @@ TEST(GBTree, PredictionCache) { gbtree.Configure({{"tree_method", "hist"}}); auto p_m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); - linalg::Matrix gpair({kRows}, ctx.Ordinal()); + linalg::Matrix gpair({kRows}, ctx.Device()); gpair.Data()->Copy(GenerateRandomGradients(kRows)); PredictionCacheEntry out_predictions; @@ -156,7 +156,7 @@ TEST(GBTree, ChoosePredictor) { // pull data into device. data.HostVector(); - data.SetDevice(0); + data.SetDevice(DeviceOrd::CUDA(0)); data.DeviceSpan(); ASSERT_FALSE(data.HostCanWrite()); @@ -215,7 +215,7 @@ TEST(GBTree, ChooseTreeMethod) { } learner->Configure(); for (std::int32_t i = 0; i < 3; ++i) { - linalg::Matrix gpair{{Xy->Info().num_row_}, Context::kCpuId}; + linalg::Matrix gpair{{Xy->Info().num_row_}, DeviceOrd::CPU()}; gpair.Data()->Copy(GenerateRandomGradients(Xy->Info().num_row_)); learner->BoostOneIter(0, Xy, &gpair); } @@ -400,7 +400,7 @@ class Dart : public testing::TestWithParam { if (device == "GPU") { ctx = MakeCUDACtx(0); } - auto rng = RandomDataGenerator(kRows, kCols, 0).Device(ctx.gpu_id); + auto rng = RandomDataGenerator(kRows, kCols, 0).Device(ctx.Device()); auto array_str = rng.GenerateArrayInterface(&data); auto p_mat = GetDMatrixFromData(data.HostVector(), kRows, kCols); @@ -710,7 +710,7 @@ TEST(GBTree, InplacePredictionError) { auto test_qdm_err = [&](std::string booster, Context const* ctx) { std::shared_ptr p_fmat; bst_bin_t max_bins = 16; - auto rng = RandomDataGenerator{n_samples, n_features, 0.5f}.Device(ctx->gpu_id).Bins(max_bins); + auto rng = RandomDataGenerator{n_samples, n_features, 0.5f}.Device(ctx->Device()).Bins(max_bins); if (ctx->IsCPU()) { p_fmat = rng.GenerateQuantileDMatrix(true); } else { diff --git a/tests/cpp/gbm/test_gbtree.cu b/tests/cpp/gbm/test_gbtree.cu index 801c935d6..f308e3b3e 100644 --- a/tests/cpp/gbm/test_gbtree.cu +++ b/tests/cpp/gbm/test_gbtree.cu @@ -22,7 +22,7 @@ void TestInplaceFallback(Context const* ctx) { bst_feature_t n_features{32}; HostDeviceVector X_storage; // use a different device than the learner - std::int32_t data_ordinal = ctx->IsCPU() ? 0 : -1; + auto data_ordinal = ctx->IsCPU() ? 
DeviceOrd::CUDA(0) : DeviceOrd::CPU(); auto X = RandomDataGenerator{n_samples, n_features, 0.0} .Device(data_ordinal) .GenerateArrayInterface(&X_storage); @@ -30,7 +30,7 @@ void TestInplaceFallback(Context const* ctx) { auto y = RandomDataGenerator{n_samples, 1u, 0.0}.GenerateArrayInterface(&y_storage); std::shared_ptr Xy; - if (data_ordinal == Context::kCpuId) { + if (data_ordinal.IsCPU()) { auto X_adapter = data::ArrayAdapter{StringView{X}}; Xy.reset(DMatrix::Create(&X_adapter, std::numeric_limits::quiet_NaN(), ctx->Threads())); } else { @@ -49,7 +49,7 @@ void TestInplaceFallback(Context const* ctx) { std::shared_ptr p_m{new data::DMatrixProxy}; auto proxy = std::dynamic_pointer_cast(p_m); - if (data_ordinal == Context::kCpuId) { + if (data_ordinal.IsCPU()) { proxy->SetArrayData(StringView{X}); } else { proxy->SetCUDAArray(X.c_str()); @@ -64,7 +64,7 @@ void TestInplaceFallback(Context const* ctx) { // test when the contexts match Context new_ctx = *proxy->Ctx(); - ASSERT_NE(new_ctx.gpu_id, ctx->gpu_id); + ASSERT_NE(new_ctx.Ordinal(), ctx->Ordinal()); learner->SetParam("device", new_ctx.DeviceName()); HostDeviceVector* out_predt_1{nullptr}; diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 276814095..648278b29 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -119,8 +119,10 @@ void CheckObjFunction(std::unique_ptr const& obj, std::vector out_hess) { xgboost::MetaInfo info; info.num_row_ = labels.size(); - info.labels = xgboost::linalg::Tensor{ - labels.cbegin(), labels.cend(), {labels.size(), static_cast(1)}, -1}; + info.labels = xgboost::linalg::Tensor{labels.cbegin(), + labels.cend(), + {labels.size(), static_cast(1)}, + xgboost::DeviceOrd::CPU()}; info.weights_.HostVector() = weights; CheckObjFunctionImpl(obj, preds, labels, weights, info, out_grad, out_hess); @@ -155,8 +157,10 @@ void CheckRankingObjFunction(std::unique_ptr const& obj, std::vector out_hess) { xgboost::MetaInfo info; info.num_row_ = labels.size(); - info.labels = xgboost::linalg::Matrix{ - labels.cbegin(), labels.cend(), {labels.size(), static_cast(1)}, -1}; + info.labels = xgboost::linalg::Matrix{labels.cbegin(), + labels.cend(), + {labels.size(), static_cast(1)}, + xgboost::DeviceOrd::CPU()}; info.weights_.HostVector() = weights; info.group_ptr_ = groups; @@ -171,8 +175,9 @@ xgboost::bst_float GetMetricEval(xgboost::Metric* metric, xgboost::DataSplitMode data_split_mode) { return GetMultiMetricEval( metric, preds, - xgboost::linalg::Tensor{labels.begin(), labels.end(), {labels.size()}, -1}, weights, - groups, data_split_mode); + xgboost::linalg::Tensor{ + labels.begin(), labels.end(), {labels.size()}, xgboost::DeviceOrd::CPU()}, + weights, groups, data_split_mode); } double GetMultiMetricEval(xgboost::Metric* metric, @@ -215,7 +220,7 @@ void RandomDataGenerator::GenerateLabels(std::shared_ptr p_fmat) const p_fmat->Info().labels.Data()); CHECK_EQ(p_fmat->Info().labels.Size(), this->rows_ * this->n_targets_); p_fmat->Info().labels.Reshape(this->rows_, this->n_targets_); - if (device_ != Context::kCpuId) { + if (device_.IsCUDA()) { p_fmat->Info().labels.SetDevice(device_); } } @@ -236,7 +241,7 @@ void RandomDataGenerator::GenerateDense(HostDeviceVector *out) const { v = dist(&lcg); } } - if (device_ >= 0) { + if (device_.IsCUDA()) { out->SetDevice(device_); out->DeviceSpan(); } @@ -258,7 +263,7 @@ std::string RandomDataGenerator::GenerateArrayInterface( std::pair, std::string> MakeArrayInterfaceBatch( HostDeviceVector const* storage, std::size_t n_samples, bst_feature_t n_features, - 
std::size_t batches, std::int32_t device) { + std::size_t batches, DeviceOrd device) { std::vector result(batches); std::vector objects; @@ -267,7 +272,7 @@ std::pair, std::string> MakeArrayInterfaceBatch( auto make_interface = [storage, device, n_features](std::size_t offset, std::size_t rows) { Json array_interface{Object()}; array_interface["data"] = std::vector(2); - if (device >= 0) { + if (device.IsCUDA()) { array_interface["data"][0] = Integer(reinterpret_cast(storage->DevicePointer() + offset)); array_interface["stream"] = Null{}; @@ -359,7 +364,7 @@ void RandomDataGenerator::GenerateCSR( h_rptr.emplace_back(rptr); } - if (device_ >= 0) { + if (device_.IsCUDA()) { value->SetDevice(device_); value->DeviceSpan(); row_ptr->SetDevice(device_); @@ -373,9 +378,8 @@ void RandomDataGenerator::GenerateCSR( CHECK_EQ(columns->Size(), value->Size()); } -[[nodiscard]] std::shared_ptr RandomDataGenerator::GenerateDMatrix(bool with_label, - bool float_label, - size_t classes) const { +[[nodiscard]] std::shared_ptr RandomDataGenerator::GenerateDMatrix( + bool with_label, bool float_label, size_t classes, DataSplitMode data_split_mode) const { HostDeviceVector data; HostDeviceVector rptrs; HostDeviceVector columns; @@ -383,7 +387,7 @@ void RandomDataGenerator::GenerateCSR( data::CSRAdapter adapter(rptrs.HostPointer(), columns.HostPointer(), data.HostPointer(), rows_, data.Size(), cols_); std::shared_ptr out{ - DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), 1)}; + DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), 1, "", data_split_mode)}; if (with_label) { RandomDataGenerator gen{rows_, n_targets_, 0.0f}; @@ -400,7 +404,7 @@ void RandomDataGenerator::GenerateCSR( out->Info().labels.Reshape(this->rows_, this->n_targets_); } } - if (device_ >= 0) { + if (device_.IsCUDA()) { out->Info().labels.SetDevice(device_); out->Info().feature_types.SetDevice(device_); for (auto const& page : out->GetBatches()) { @@ -423,7 +427,7 @@ void RandomDataGenerator::GenerateCSR( CHECK_GE(this->n_batches_, 1) << "Must set the n_batches before generating an external memory DMatrix."; std::unique_ptr iter; - if (device_ == Context::kCpuId) { + if (device_.IsCPU()) { iter = std::make_unique(this->sparsity_, rows_, cols_, n_batches_); } else { #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP) @@ -487,7 +491,7 @@ int CudaArrayIterForTest::Next() { NumpyArrayIterForTest::NumpyArrayIterForTest(float sparsity, size_t rows, size_t cols, size_t batches) : ArrayIterForTest{sparsity, rows, cols, batches} { - rng_->Device(Context::kCpuId); + rng_->Device(DeviceOrd::CPU()); std::tie(batches_, interface_) = rng_->GenerateArrayInterfaceBatch(&data_, n_batches_); this->Reset(); } @@ -644,8 +648,8 @@ std::unique_ptr CreateTrainedGBM(std::string name, Args kwargs, labels[i] = i; } p_dmat->Info().labels = - linalg::Tensor{labels.cbegin(), labels.cend(), {labels.size()}, -1}; - linalg::Matrix gpair({kRows}, ctx->Ordinal()); + linalg::Tensor{labels.cbegin(), labels.cend(), {labels.size()}, DeviceOrd::CPU()}; + linalg::Matrix gpair({kRows}, ctx->Device()); auto h_gpair = gpair.HostView(); for (size_t i = 0; i < kRows; ++i) { h_gpair(i) = GradientPair{static_cast(i), 1}; @@ -674,7 +678,7 @@ ArrayIterForTest::ArrayIterForTest(Context const* ctx, HostDeviceVector c CHECK_EQ(this->data_.Size(), rows_ * cols_ * n_batches); this->data_.Copy(data); std::tie(batches_, interface_) = - MakeArrayInterfaceBatch(&data_, rows_, cols_, n_batches_, ctx->gpu_id); + MakeArrayInterfaceBatch(&data_, rows_, cols_, n_batches_, 
ctx->Device()); } ArrayIterForTest::~ArrayIterForTest() { XGDMatrixFree(proxy_); } diff --git a/tests/cpp/helpers.cu b/tests/cpp/helpers.cu index 7885c6def..00789452e 100644 --- a/tests/cpp/helpers.cu +++ b/tests/cpp/helpers.cu @@ -13,7 +13,7 @@ namespace xgboost { CudaArrayIterForTest::CudaArrayIterForTest(float sparsity, size_t rows, size_t cols, size_t batches) : ArrayIterForTest{sparsity, rows, cols, batches} { - rng_->Device(0); + rng_->Device(FstCU()); std::tie(batches_, interface_) = rng_->GenerateArrayInterfaceBatch(&data_, n_batches_); this->Reset(); diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index e8cf8394b..06fb43564 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -231,7 +231,7 @@ class RandomDataGenerator { bst_target_t n_targets_{1}; - std::int32_t device_{Context::kCpuId}; + DeviceOrd device_{DeviceOrd::CPU()}; std::size_t n_batches_{0}; std::uint64_t seed_{0}; SimpleLCG lcg_; @@ -256,7 +256,7 @@ class RandomDataGenerator { upper_ = v; return *this; } - RandomDataGenerator& Device(int32_t d) { + RandomDataGenerator& Device(DeviceOrd d) { device_ = d; return *this; } @@ -310,9 +310,9 @@ class RandomDataGenerator { void GenerateCSR(HostDeviceVector* value, HostDeviceVector* row_ptr, HostDeviceVector* columns) const; - [[nodiscard]] std::shared_ptr GenerateDMatrix(bool with_label = false, - bool float_label = true, - size_t classes = 1) const; + [[nodiscard]] std::shared_ptr GenerateDMatrix( + bool with_label = false, bool float_label = true, size_t classes = 1, + DataSplitMode data_split_mode = DataSplitMode::kRow) const; [[nodiscard]] std::shared_ptr GenerateSparsePageDMatrix(std::string prefix, bool with_label) const; @@ -391,7 +391,7 @@ std::unique_ptr CreateTrainedGBM(std::string name, Args kwargs, * \brief Make a context that uses CUDA if device >= 0. 
*/ inline Context MakeCUDACtx(std::int32_t device) { - if (device == Context::kCpuId) { + if (device == DeviceOrd::CPUOrdinal()) { return Context{}; } return Context{}.MakeCUDA(device); @@ -501,7 +501,7 @@ RMMAllocatorPtr SetUpRMMResourceForCppTests(int argc, char** argv); * \brief Make learner model param */ inline LearnerModelParam MakeMP(bst_feature_t n_features, float base_score, uint32_t n_groups, - int32_t device = Context::kCpuId) { + DeviceOrd device = DeviceOrd::CPU()) { size_t shape[1]{1}; LearnerModelParam mparam(n_features, linalg::Tensor{{base_score}, shape, device}, n_groups, 1, MultiStrategy::kOneOutputPerTree); @@ -571,4 +571,5 @@ class BaseMGPUTest : public ::testing::Test { class DeclareUnifiedDistributedTest(MetricTest) : public BaseMGPUTest{}; +inline DeviceOrd FstCU() { return DeviceOrd::CUDA(0); } } // namespace xgboost diff --git a/tests/cpp/histogram_helpers.h b/tests/cpp/histogram_helpers.h index 6930447f0..d09a1dce6 100644 --- a/tests/cpp/histogram_helpers.h +++ b/tests/cpp/histogram_helpers.h @@ -1,3 +1,8 @@ +/** + * Copyright 2020-2023, XGBoost contributors + */ +#pragma once + #if defined(__CUDACC__) || defined(__HIP_PLATFORM_AMD__) #include "../../src/data/ellpack_page.cuh" #endif @@ -24,8 +29,8 @@ class HistogramCutsWrapper : public common::HistogramCuts { }; } // anonymous namespace -inline std::unique_ptr BuildEllpackPage( - int n_rows, int n_cols, bst_float sparsity= 0) { +inline std::unique_ptr BuildEllpackPage(int n_rows, int n_cols, + bst_float sparsity = 0) { auto dmat = RandomDataGenerator(n_rows, n_cols, sparsity).Seed(3).GenerateDMatrix(); const SparsePage& batch = *dmat->GetBatches().begin(); @@ -49,7 +54,7 @@ inline std::unique_ptr BuildEllpackPage( } auto page = std::unique_ptr( - new EllpackPageImpl(0, cmat, batch, dmat->IsDense(), row_stride, {})); + new EllpackPageImpl(DeviceOrd::CUDA(0), cmat, batch, dmat->IsDense(), row_stride, {})); return page; } diff --git a/tests/cpp/metric/test_auc.h b/tests/cpp/metric/test_auc.h index 0dd3dd83e..cef6d9757 100644 --- a/tests/cpp/metric/test_auc.h +++ b/tests/cpp/metric/test_auc.h @@ -28,7 +28,7 @@ inline void VerifyBinaryAUC(DataSplitMode data_split_mode = DataSplitMode::kRow) // Invalid dataset auto p_fmat = EmptyDMatrix(); MetaInfo& info = p_fmat->Info(); - info.labels = linalg::Tensor{{0.0f, 0.0f}, {2}, -1}; + info.labels = linalg::Tensor{{0.0f, 0.0f}, {2}, DeviceOrd::CPU()}; float auc = metric->Evaluate({1, 1}, p_fmat); ASSERT_TRUE(std::isnan(auc)); *info.labels.Data() = HostDeviceVector{}; diff --git a/tests/cpp/metric/test_elementwise_metric.cc b/tests/cpp/metric/test_elementwise_metric.cc index 13021fb6a..11854ce88 100644 --- a/tests/cpp/metric/test_elementwise_metric.cc +++ b/tests/cpp/metric/test_elementwise_metric.cc @@ -3,8 +3,7 @@ */ #include "test_elementwise_metric.h" -namespace xgboost { -namespace metric { +namespace xgboost::metric { TEST(Metric, DeclareUnifiedTest(RMSE)) { VerifyRMSE(); } TEST(Metric, DeclareUnifiedTest(RMSLE)) { VerifyRMSLE(); } @@ -104,5 +103,4 @@ TEST_F(DeclareUnifiedDistributedTest(MetricTest), QuantileRowSplit) { TEST_F(DeclareUnifiedDistributedTest(MetricTest), QuantileColumnSplit) { DoTest(VerifyQuantile, DataSplitMode::kCol); } -} // namespace metric -} // namespace xgboost +} // namespace xgboost::metric diff --git a/tests/cpp/metric/test_elementwise_metric.h b/tests/cpp/metric/test_elementwise_metric.h index a32bb0438..ef34d7651 100644 --- a/tests/cpp/metric/test_elementwise_metric.h +++ b/tests/cpp/metric/test_elementwise_metric.h @@ -11,9 +11,7 @@ #include 
"../../../src/common/linalg_op.h" #include "../helpers.h" -namespace xgboost { -namespace metric { - +namespace xgboost::metric { inline void CheckDeterministicMetricElementWise(StringView name, int32_t device) { auto ctx = MakeCUDACtx(device); std::unique_ptr metric{Metric::Create(name.c_str(), &ctx)}; @@ -325,14 +323,14 @@ inline void VerifyPoissonNegLogLik(DataSplitMode data_split_mode = DataSplitMode } inline void VerifyMultiRMSE(DataSplitMode data_split_mode = DataSplitMode::kRow) { + auto ctx = MakeCUDACtx(GPUIDX); size_t n_samples = 32, n_targets = 8; - linalg::Tensor y{{n_samples, n_targets}, GPUIDX}; + linalg::Tensor y{{n_samples, n_targets}, ctx.Device()}; auto &h_y = y.Data()->HostVector(); std::iota(h_y.begin(), h_y.end(), 0); HostDeviceVector predt(n_samples * n_targets, 0); - auto ctx = MakeCUDACtx(GPUIDX); std::unique_ptr metric{Metric::Create("rmse", &ctx)}; metric->Configure({}); @@ -381,5 +379,4 @@ inline void VerifyQuantile(DataSplitMode data_split_mode = DataSplitMode::kRow) metric->Configure(Args{{"quantile_alpha", "[1.0]"}}); EXPECT_NEAR(GetMetricEval(metric.get(), predts, labels, {}, {}, data_split_mode), 0.3f, 0.001f); } -} // namespace metric -} // namespace xgboost +} // namespace xgboost::metric diff --git a/tests/cpp/metric/test_rank_metric.h b/tests/cpp/metric/test_rank_metric.h index 2f7785689..5d5e87072 100644 --- a/tests/cpp/metric/test_rank_metric.h +++ b/tests/cpp/metric/test_rank_metric.h @@ -154,7 +154,7 @@ inline void VerifyNDCGExpGain(DataSplitMode data_split_mode = DataSplitMode::kRo auto p_fmat = xgboost::RandomDataGenerator{0, 0, 0}.GenerateDMatrix(); MetaInfo& info = p_fmat->Info(); - info.labels = linalg::Matrix{{10.0f, 0.0f, 0.0f, 1.0f, 5.0f}, {5}, ctx.gpu_id}; + info.labels = linalg::Matrix{{10.0f, 0.0f, 0.0f, 1.0f, 5.0f}, {5}, ctx.Device()}; info.num_row_ = info.labels.Shape(0); info.group_ptr_.resize(2); info.group_ptr_[0] = 0; diff --git a/tests/cpp/objective/test_lambdarank_obj.cc b/tests/cpp/objective/test_lambdarank_obj.cc index 963f69639..2b34cfa38 100644 --- a/tests/cpp/objective/test_lambdarank_obj.cc +++ b/tests/cpp/objective/test_lambdarank_obj.cc @@ -71,7 +71,7 @@ void TestNDCGGPair(Context const* ctx) { HostDeviceVector predts{0, 1, 0, 1}; MetaInfo info; - info.labels = linalg::Tensor{{0, 1, 0, 1}, {4, 1}, GPUIDX}; + info.labels = linalg::Tensor{{0, 1, 0, 1}, {4, 1}, ctx->Device()}; info.group_ptr_ = {0, 2, 4}; info.num_row_ = 4; linalg::Matrix gpairs; @@ -146,7 +146,7 @@ TEST(LambdaRank, UnbiasedNDCG) { } void InitMakePairTest(Context const* ctx, MetaInfo* out_info, HostDeviceVector* out_predt) { - out_predt->SetDevice(ctx->gpu_id); + out_predt->SetDevice(ctx->Device()); MetaInfo& info = *out_info; info.num_row_ = 128; info.labels.ModifyInplace([&](HostDeviceVector* data, common::Span shape) { @@ -243,7 +243,7 @@ void TestMAPStat(Context const* ctx) { auto p_cache = std::make_shared(ctx, info, param); - predt.SetDevice(ctx->gpu_id); + predt.SetDevice(ctx->Device()); auto rank_idx = p_cache->SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan()); @@ -280,7 +280,7 @@ void TestMAPStat(Context const* ctx) { auto p_cache = std::make_shared(ctx, info, param); - predt.SetDevice(ctx->gpu_id); + predt.SetDevice(ctx->Device()); auto rank_idx = p_cache->SortedIdx(ctx, ctx->IsCPU() ? 
       p_cache->SortedIdx(ctx, ctx->IsCPU() ? predt.ConstHostSpan() : predt.ConstDeviceSpan());
diff --git a/tests/cpp/objective/test_objective.cc b/tests/cpp/objective/test_objective.cc
index 718f8f659..21ffc7caf 100644
--- a/tests/cpp/objective/test_objective.cc
+++ b/tests/cpp/objective/test_objective.cc
@@ -6,6 +6,7 @@
 #include
 #include "../helpers.h"
+#include "../objective_helpers.h"

 TEST(Objective, UnknownFunction) {
   xgboost::ObjFunction* obj = nullptr;
@@ -43,4 +44,61 @@ TEST(Objective, PredTransform) {
     ASSERT_TRUE(predts.HostCanWrite());
   }
 }
+
+class TestDefaultObjConfig : public ::testing::TestWithParam<std::string> {
+  Context ctx_;
+
+ public:
+  void Run(std::string objective) {
+    auto Xy = MakeFmatForObjTest(objective);
+    std::unique_ptr<Learner> learner{Learner::Create({Xy})};
+    std::unique_ptr<ObjFunction> objfn{ObjFunction::Create(objective, &ctx_)};
+
+    learner->SetParam("objective", objective);
+    if (objective.find("multi") != std::string::npos) {
+      learner->SetParam("num_class", "3");
+      objfn->Configure(Args{{"num_class", "3"}});
+    } else if (objective.find("quantile") != std::string::npos) {
+      learner->SetParam("quantile_alpha", "0.5");
+      objfn->Configure(Args{{"quantile_alpha", "0.5"}});
+    } else {
+      objfn->Configure(Args{});
+    }
+    learner->Configure();
+    learner->UpdateOneIter(0, Xy);
+    learner->EvalOneIter(0, {Xy}, {"train"});
+    Json config{Object{}};
+    learner->SaveConfig(&config);
+    auto jobj = get(config["learner"]["objective"]);
+
+    ASSERT_TRUE(jobj.find("name") != jobj.cend());
+    // FIXME(jiamingy): We should have the following check, but some legacy parameter like
+    // "pos_weight", "delta_step" in objectives are not in metrics.
+
+    // if (jobj.size() > 1) {
+    //   ASSERT_FALSE(IsA(objfn->DefaultMetricConfig()));
+    // }
+    auto mconfig = objfn->DefaultMetricConfig();
+    if (!IsA(mconfig)) {
+      // make sure metric can handle it
+      std::unique_ptr<Metric> metricfn{Metric::Create(get(mconfig["name"]), &ctx_)};
+      metricfn->LoadConfig(mconfig);
+      Json loaded(Object{});
+      metricfn->SaveConfig(&loaded);
+      metricfn->Configure(Args{});
+      ASSERT_EQ(mconfig, loaded);
+    }
+  }
+};
+
+TEST_P(TestDefaultObjConfig, Objective) {
+  std::string objective = GetParam();
+  this->Run(objective);
+}
+
+INSTANTIATE_TEST_SUITE_P(Objective, TestDefaultObjConfig,
+                         ::testing::ValuesIn(MakeObjNamesForTest()),
+                         [](const ::testing::TestParamInfo& info) {
+                           return ObjTestNameGenerator(info);
+                         });
 }  // namespace xgboost
diff --git a/tests/cpp/objective/test_quantile_obj.cc b/tests/cpp/objective/test_quantile_obj.cc
index b263b4a8f..5b0a981e1 100644
--- a/tests/cpp/objective/test_quantile_obj.cc
+++ b/tests/cpp/objective/test_quantile_obj.cc
@@ -45,7 +45,7 @@ TEST(Objective, DeclareUnifiedTest(QuantileIntercept)) {
   MetaInfo info;
   info.num_row_ = 10;
   info.labels.ModifyInplace([&](HostDeviceVector* data, common::Span shape) {
-    data->SetDevice(ctx.gpu_id);
+    data->SetDevice(ctx.Device());
     data->Resize(info.num_row_);
     shape[0] = info.num_row_;
     shape[1] = 1;
diff --git a/tests/cpp/objective_helpers.cc b/tests/cpp/objective_helpers.cc
new file mode 100644
index 000000000..ed80f71d5
--- /dev/null
+++ b/tests/cpp/objective_helpers.cc
@@ -0,0 +1,31 @@
+/**
+ * Copyright (c) 2023, XGBoost contributors
+ */
+#include "objective_helpers.h"
+
+#include "../../src/common/linalg_op.h"  // for begin, end
+#include "helpers.h"                     // for RandomDataGenerator
+
+namespace xgboost {
+std::shared_ptr<DMatrix> MakeFmatForObjTest(std::string const& obj) {
+  auto constexpr kRows = 10, kCols = 10;
+  auto p_fmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
+  auto& h_upper = p_fmat->Info().labels_upper_bound_.HostVector();
+  auto& h_lower = p_fmat->Info().labels_lower_bound_.HostVector();
+  h_lower.resize(kRows);
+  h_upper.resize(kRows);
+  for (size_t i = 0; i < kRows; ++i) {
+    h_lower[i] = 1;
+    h_upper[i] = 10;
+  }
+  if (obj.find("rank:") != std::string::npos) {
+    auto h_label = p_fmat->Info().labels.HostView();
+    std::size_t k = 0;
+    for (auto& v : h_label) {
+      v = k % 2 == 0;
+      ++k;
+    }
+  }
+  return p_fmat;
+};
+}  // namespace xgboost
diff --git a/tests/cpp/objective_helpers.h b/tests/cpp/objective_helpers.h
index b26470746..7f394ef8d 100644
--- a/tests/cpp/objective_helpers.h
+++ b/tests/cpp/objective_helpers.h
@@ -1,6 +1,8 @@
 /**
  * Copyright (c) 2023, XGBoost contributors
  */
+#pragma once
+
 #include   // for Registry
 #include
 #include   // for ObjFunctionReg
@@ -29,4 +31,6 @@ inline std::string ObjTestNameGenerator(const ::testing::TestParamInfo
+
+std::shared_ptr<DMatrix> MakeFmatForObjTest(std::string const& obj);
 }  // namespace xgboost
diff --git a/tests/cpp/plugin/helpers.h b/tests/cpp/plugin/helpers.h
index 20b4afc30..b756adefd 100644
--- a/tests/cpp/plugin/helpers.h
+++ b/tests/cpp/plugin/helpers.h
@@ -23,7 +23,7 @@ class ServerForTest {
   std::unique_ptr server_;

  public:
-  explicit ServerForTest(std::int32_t world_size) {
+  explicit ServerForTest(std::size_t world_size) {
     server_thread_.reset(new std::thread([this, world_size] {
       grpc::ServerBuilder builder;
       xgboost::federated::FederatedService service{world_size};
diff --git a/tests/cpp/plugin/test_federated_communicator.cc b/tests/cpp/plugin/test_federated_communicator.cc
index 8b0e1039a..68b112f1c 100644
--- a/tests/cpp/plugin/test_federated_communicator.cc
+++ b/tests/cpp/plugin/test_federated_communicator.cc
@@ -19,6 +19,11 @@ class FederatedCommunicatorTest : public BaseFederatedTest {
     CheckAllgather(comm, rank);
   }

+  static void VerifyAllgatherV(int rank, const std::string &server_address) {
+    FederatedCommunicator comm{kWorldSize, rank, server_address};
+    CheckAllgatherV(comm, rank);
+  }
+
   static void VerifyAllreduce(int rank, const std::string &server_address) {
     FederatedCommunicator comm{kWorldSize, rank, server_address};
     CheckAllreduce(comm);
@@ -31,14 +36,19 @@ class FederatedCommunicatorTest : public BaseFederatedTest {
 protected:
   static void CheckAllgather(FederatedCommunicator &comm, int rank) {
-    int buffer[kWorldSize] = {0, 0};
-    buffer[rank] = rank;
-    comm.AllGather(buffer, sizeof(buffer));
+    std::string input{static_cast('0' + rank)};
+    auto output = comm.AllGather(input);
     for (auto i = 0; i < kWorldSize; i++) {
-      EXPECT_EQ(buffer[i], i);
+      EXPECT_EQ(output[i], static_cast('0' + i));
     }
   }

+  static void CheckAllgatherV(FederatedCommunicator &comm, int rank) {
+    std::vector<std::string> inputs{"Federated", " Learning!!!"};
+    auto output = comm.AllGatherV(inputs[rank]);
+    EXPECT_EQ(output, "Federated Learning!!!");
+  }
+
   static void CheckAllreduce(FederatedCommunicator &comm) {
     int buffer[] = {1, 2, 3, 4, 5};
     comm.AllReduce(buffer, sizeof(buffer) / sizeof(buffer[0]), DataType::kInt32, Operation::kSum);
@@ -119,6 +129,16 @@ TEST_F(FederatedCommunicatorTest, Allgather) {
   }
 }

+TEST_F(FederatedCommunicatorTest, AllgatherV) {
+  std::vector<std::thread> threads;
+  for (auto rank = 0; rank < kWorldSize; rank++) {
+    threads.emplace_back(&FederatedCommunicatorTest::VerifyAllgatherV, rank, server_->Address());
+  }
+  for (auto &thread : threads) {
+    thread.join();
+  }
+}
+
 TEST_F(FederatedCommunicatorTest, Allreduce) {
   std::vector<std::thread> threads;
   for (auto rank = 0; rank < kWorldSize; rank++) {
diff --git a/tests/cpp/plugin/test_federated_learner.cc b/tests/cpp/plugin/test_federated_learner.cc
index d8f552c41..954e0cdb7 100644
--- a/tests/cpp/plugin/test_federated_learner.cc
+++ b/tests/cpp/plugin/test_federated_learner.cc
@@ -120,6 +120,11 @@ TEST_P(VerticalFederatedLearnerTest, Hist) {
 }

 #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP)
+TEST_P(VerticalFederatedLearnerTest, GPUApprox) {
+  std::string objective = GetParam();
+  this->Run("approx", "cuda:0", objective);
+}
+
 TEST_P(VerticalFederatedLearnerTest, GPUHist) {
   std::string objective = GetParam();
   this->Run("hist", "cuda:0", objective);
diff --git a/tests/cpp/plugin/test_federated_server.cc b/tests/cpp/plugin/test_federated_server.cc
index 633d64df1..c40e58fa3 100644
--- a/tests/cpp/plugin/test_federated_server.cc
+++ b/tests/cpp/plugin/test_federated_server.cc
@@ -18,6 +18,11 @@ class FederatedServerTest : public BaseFederatedTest {
     CheckAllgather(client, rank);
   }

+  static void VerifyAllgatherV(int rank, const std::string& server_address) {
+    federated::FederatedClient client{server_address, rank};
+    CheckAllgatherV(client, rank);
+  }
+
   static void VerifyAllreduce(int rank, const std::string& server_address) {
     federated::FederatedClient client{server_address, rank};
     CheckAllreduce(client);
@@ -39,8 +44,7 @@ class FederatedServerTest : public BaseFederatedTest {
 protected:
   static void CheckAllgather(federated::FederatedClient& client, int rank) {
-    int data[kWorldSize] = {0, 0};
-    data[rank] = rank;
+    int data[] = {rank};
     std::string send_buffer(reinterpret_cast(data), sizeof(data));
     auto reply = client.Allgather(send_buffer);
     auto const* result = reinterpret_cast(reply.data());
@@ -49,6 +53,12 @@
     }
   }

+  static void CheckAllgatherV(federated::FederatedClient& client, int rank) {
+    std::vector<std::string> inputs{"Hello,", " World!"};
+    auto reply = client.AllgatherV(inputs[rank]);
+    EXPECT_EQ(reply, "Hello, World!");
+  }
+
   static void CheckAllreduce(federated::FederatedClient& client) {
     int data[] = {1, 2, 3, 4, 5};
     std::string send_buffer(reinterpret_cast(data), sizeof(data));
@@ -80,6 +90,16 @@ TEST_F(FederatedServerTest, Allgather) {
   }
 }

+TEST_F(FederatedServerTest, AllgatherV) {
+  std::vector<std::thread> threads;
+  for (auto rank = 0; rank < kWorldSize; rank++) {
+    threads.emplace_back(&FederatedServerTest::VerifyAllgatherV, rank, server_->Address());
+  }
+  for (auto& thread : threads) {
+    thread.join();
+  }
+}
+
 TEST_F(FederatedServerTest, Allreduce) {
   std::vector<std::thread> threads;
   for (auto rank = 0; rank < kWorldSize; rank++) {
diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc
index 5ff0fdeec..07f33d72e 100644
--- a/tests/cpp/predictor/test_cpu_predictor.cc
+++ b/tests/cpp/predictor/test_cpu_predictor.cc
@@ -127,8 +127,8 @@ TEST(CpuPredictor, IterationRange) {
 }

 TEST(CpuPredictor, IterationRangeColmnSplit) {
-  Context ctx;
-  TestIterationRangeColumnSplit(&ctx);
+  auto constexpr kWorldSize = 2;
+  TestIterationRangeColumnSplit(kWorldSize, false);
 }

 TEST(CpuPredictor, ExternalMemory) {
@@ -142,7 +142,7 @@ TEST(CpuPredictor, InplacePredict) {
   bst_row_t constexpr kRows{128};
   bst_feature_t constexpr kCols{64};
   Context ctx;
-  auto gen = RandomDataGenerator{kRows, kCols, 0.5}.Device(ctx.gpu_id);
+  auto gen = RandomDataGenerator{kRows, kCols, 0.5}.Device(ctx.Device());
   {
     HostDeviceVector data;
     gen.GenerateDense(&data);
@@ -226,23 +226,21 @@ TEST(CPUPredictor, GHistIndexTraining) {
 }

 TEST(CPUPredictor, CategoricalPrediction) {
-  Context ctx;
-  TestCategoricalPrediction(&ctx, false);
+  TestCategoricalPrediction(false, false);
 }

 TEST(CPUPredictor, CategoricalPredictionColumnSplit) {
-  Context ctx;
-  TestCategoricalPredictionColumnSplit(&ctx);
+  auto constexpr kWorldSize = 2;
+  RunWithInMemoryCommunicator(kWorldSize, TestCategoricalPrediction, false, true);
 }

 TEST(CPUPredictor, CategoricalPredictLeaf) {
-  Context ctx;
-  TestCategoricalPredictLeaf(&ctx, false);
+  TestCategoricalPredictLeaf(false, false);
 }

 TEST(CPUPredictor, CategoricalPredictLeafColumnSplit) {
-  Context ctx;
-  TestCategoricalPredictLeafColumnSplit(&ctx);
+  auto constexpr kWorldSize = 2;
+  RunWithInMemoryCommunicator(kWorldSize, TestCategoricalPredictLeaf, false, true);
 }

 TEST(CpuPredictor, UpdatePredictionCache) {
@@ -256,8 +254,8 @@ TEST(CpuPredictor, LesserFeatures) {
 }

 TEST(CpuPredictor, LesserFeaturesColumnSplit) {
-  Context ctx;
-  TestPredictionWithLesserFeaturesColumnSplit(&ctx);
+  auto constexpr kWorldSize = 2;
+  RunWithInMemoryCommunicator(kWorldSize, TestPredictionWithLesserFeaturesColumnSplit, false);
 }

 TEST(CpuPredictor, Sparse) {
@@ -267,9 +265,9 @@ TEST(CpuPredictor, Sparse) {
 }

 TEST(CpuPredictor, SparseColumnSplit) {
-  Context ctx;
-  TestSparsePredictionColumnSplit(&ctx, 0.2);
-  TestSparsePredictionColumnSplit(&ctx, 0.8);
+  auto constexpr kWorldSize = 2;
+  TestSparsePredictionColumnSplit(kWorldSize, false, 0.2);
+  TestSparsePredictionColumnSplit(kWorldSize, false, 0.8);
 }

 TEST(CpuPredictor, Multi) {
diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu
index 2abe954fc..b15076773 100644
--- a/tests/cpp/predictor/test_gpu_predictor.cu
+++ b/tests/cpp/predictor/test_gpu_predictor.cu
@@ -38,7 +38,7 @@ TEST(GPUPredictor, Basic) {
     auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix();
     auto ctx = MakeCUDACtx(0);

-    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())};
+    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Device())};
     gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);

     // Test predict batch
@@ -74,7 +74,7 @@ void VerifyBasicColumnSplit(std::array, 32> const& expected_r
   auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix();
   std::unique_ptr<DMatrix> sliced{dmat->SliceCol(world_size, rank)};

-  LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())};
+  LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Device())};
   gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);

   // Test predict batch
@@ -102,7 +102,7 @@ TEST_F(MGPUPredictorTest, BasicColumnSplit) {
     size_t n_row = i, n_col = i;
     auto dmat = RandomDataGenerator(n_row, n_col, 0).GenerateDMatrix();

-    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Ordinal())};
+    LearnerModelParam mparam{MakeMP(n_col, .5, 1, ctx.Device())};
     gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx);

     // Test predict batch
@@ -123,8 +123,10 @@ TEST(GPUPredictor, EllpackBasic) {
   auto ctx = MakeCUDACtx(0);
   for (size_t bins = 2; bins < 258; bins += 16) {
     size_t rows = bins * 16;
-    auto p_m =
-        RandomDataGenerator{rows, kCols, 0.0}.Bins(bins).Device(0).GenerateDeviceDMatrix(false);
+    auto p_m = RandomDataGenerator{rows, kCols, 0.0}
+                   .Bins(bins)
+                   .Device(DeviceOrd::CUDA(0))
+                   .GenerateDeviceDMatrix(false);
     ASSERT_FALSE(p_m->PageExists());
     TestPredictionFromGradientIndex(&ctx, rows, kCols, p_m);
     TestPredictionFromGradientIndex(&ctx, bins, kCols, p_m);
@@ -136,11 +138,11 @@ TEST(GPUPredictor, EllpackTraining) {
   size_t constexpr kRows{128}, kCols{16}, kBins{64};
   auto p_ellpack = RandomDataGenerator{kRows, kCols, 0.0}
                        .Bins(kBins)
-                       .Device(ctx.Ordinal())
+                       .Device(ctx.Device())
                        .GenerateDeviceDMatrix(false);

   HostDeviceVector storage(kRows * kCols);
   auto columnar =
-      RandomDataGenerator{kRows, kCols, 0.0}.Device(ctx.Ordinal()).GenerateArrayInterface(&storage);
+      RandomDataGenerator{kRows, kCols, 0.0}.Device(ctx.Device()).GenerateArrayInterface(&storage);
   auto adapter = data::CupyAdapter(columnar);
   std::shared_ptr<DMatrix> p_full{
       DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), 1)};
@@ -155,7 +157,7 @@ TEST(GPUPredictor, ExternalMemoryTest) {
   const int n_classes = 3;
   Context ctx = MakeCUDACtx(0);

-  LearnerModelParam mparam{MakeMP(5, .5, n_classes, ctx.Ordinal())};
+  LearnerModelParam mparam{MakeMP(5, .5, n_classes, ctx.Device())};
   gbm::GBTreeModel model = CreateTestModel(&mparam, &ctx, n_classes);
   std::vector> dmats;
@@ -166,7 +168,7 @@ TEST(GPUPredictor, ExternalMemoryTest) {
   for (const auto& dmat: dmats) {
     dmat->Info().base_margin_ = decltype(dmat->Info().base_margin_){
-        {dmat->Info().num_row_, static_cast(n_classes)}, 0};
+        {dmat->Info().num_row_, static_cast(n_classes)}, DeviceOrd::CUDA(0)};
     dmat->Info().base_margin_.Data()->Fill(0.5);
     PredictionCacheEntry out_predictions;
     gpu_predictor->InitOutPredictions(dmat->Info(), &out_predictions.predictions, model);
@@ -185,7 +187,7 @@ TEST(GPUPredictor, InplacePredictCupy) {
   auto ctx = MakeCUDACtx(0);
   size_t constexpr kRows{128}, kCols{64};
   RandomDataGenerator gen(kRows, kCols, 0.5);
-  gen.Device(ctx.Ordinal());
+  gen.Device(ctx.Device());
   HostDeviceVector data;
   std::string interface_str = gen.GenerateArrayInterface(&data);
   std::shared_ptr<DMatrix> p_fmat{new data::DMatrixProxy};
@@ -197,7 +199,7 @@ TEST(GPUPredictor, InplacePredictCuDF) {
   auto ctx = MakeCUDACtx(0);
   size_t constexpr kRows{128}, kCols{64};
   RandomDataGenerator gen(kRows, kCols, 0.5);
-  gen.Device(ctx.Ordinal());
+  gen.Device(ctx.Device());
   std::vector<HostDeviceVector<float>> storage(kCols);
   auto interface_str = gen.GenerateColumnarArrayInterface(&storage);
   std::shared_ptr<DMatrix> p_fmat{new data::DMatrixProxy};
@@ -210,6 +212,10 @@ TEST(GpuPredictor, LesserFeatures) {
   TestPredictionWithLesserFeatures(&ctx);
 }

+TEST_F(MGPUPredictorTest, LesserFeaturesColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, TestPredictionWithLesserFeaturesColumnSplit, true);
+}
+
 // Very basic test of empty model
 TEST(GPUPredictor, ShapStump) {
 #if defined(XGBOOST_USE_CUDA)
@@ -219,7 +225,7 @@ TEST(GPUPredictor, ShapStump) {
 #endif
   auto ctx = MakeCUDACtx(0);

-  LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Ordinal())};
+  LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Device())};
   gbm::GBTreeModel model(&mparam, &ctx);

   std::vector<std::unique_ptr<RegTree>> trees;
@@ -245,7 +251,7 @@ TEST(GPUPredictor, ShapStump) {

 TEST(GPUPredictor, Shap) {
   auto ctx = MakeCUDACtx(0);

-  LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Ordinal())};
+  LearnerModelParam mparam{MakeMP(1, .5, 1, ctx.Device())};
   gbm::GBTreeModel model(&mparam, &ctx);

   std::vector<std::unique_ptr<RegTree>> trees;
@@ -278,19 +284,29 @@ TEST(GPUPredictor, IterationRange) {
   TestIterationRange(&ctx);
 }

+TEST_F(MGPUPredictorTest, IterationRangeColumnSplit) {
+  TestIterationRangeColumnSplit(world_size_, true);
+}
+
 TEST(GPUPredictor, CategoricalPrediction) {
-  auto ctx = MakeCUDACtx(0);
-  TestCategoricalPrediction(&ctx, false);
+  TestCategoricalPrediction(true, false);
+}
+
+TEST_F(MGPUPredictorTest, CategoricalPredictionColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, TestCategoricalPrediction, true, true);
 }

 TEST(GPUPredictor, CategoricalPredictLeaf) {
-  auto ctx = MakeCUDACtx(0);
-  TestCategoricalPredictLeaf(&ctx, false);
+  TestCategoricalPredictLeaf(true, false);
+}
+
+TEST_F(MGPUPredictorTest, CategoricalPredictionLeafColumnSplit) {
+  RunWithInMemoryCommunicator(world_size_, TestCategoricalPredictLeaf, true, true);
 }

 TEST(GPUPredictor, PredictLeafBasic) {
   size_t constexpr kRows = 5, kCols = 5;
-  auto dmat = RandomDataGenerator(kRows, kCols, 0).Device(0).GenerateDMatrix();
+  auto dmat = RandomDataGenerator(kRows, kCols, 0).Device(DeviceOrd::CUDA(0)).GenerateDMatrix();
   auto lparam = MakeCUDACtx(GPUIDX);
   std::unique_ptr<Predictor> gpu_predictor =
       std::unique_ptr<Predictor>(Predictor::Create("gpu_predictor", &lparam));
@@ -313,4 +329,9 @@ TEST(GPUPredictor, Sparse) {
   TestSparsePrediction(&ctx, 0.2);
   TestSparsePrediction(&ctx, 0.8);
 }
+
+TEST_F(MGPUPredictorTest, SparseColumnSplit) {
+  TestSparsePredictionColumnSplit(world_size_, true, 0.2);
+  TestSparsePredictionColumnSplit(world_size_, true, 0.8);
+}
 }  // namespace xgboost::predictor
diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc
index 98f4c2576..5df04c0f8 100644
--- a/tests/cpp/predictor/test_predictor.cc
+++ b/tests/cpp/predictor/test_predictor.cc
@@ -34,7 +34,7 @@ TEST(Predictor, PredictionCache) {
   // Add a cache that is immediately expired.
   auto add_cache = [&]() {
     auto p_dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix();
-    container.Cache(p_dmat, Context::kCpuId);
+    container.Cache(p_dmat, DeviceOrd::CPU());
     m = p_dmat.get();
   };

@@ -93,7 +93,7 @@ void TestTrainingPrediction(Context const *ctx, size_t rows, size_t bins,
 void TestInplacePrediction(Context const *ctx, std::shared_ptr<DMatrix> x, bst_row_t rows,
                            bst_feature_t cols) {
   std::size_t constexpr kClasses { 4 };
-  auto gen = RandomDataGenerator{rows, cols, 0.5}.Device(ctx->gpu_id);
+  auto gen = RandomDataGenerator{rows, cols, 0.5}.Device(ctx->Device());
   std::shared_ptr<DMatrix> m = gen.GenerateDMatrix(true, false, kClasses);

   std::unique_ptr<Learner> learner {
@@ -172,16 +172,6 @@ void VerifyPredictionWithLesserFeatures(Learner *learner, bst_row_t kRows,
   ASSERT_THROW({ learner->Predict(m_invalid, false, &prediction, 0, 0); }, dmlc::Error);
 }

-void VerifyPredictionWithLesserFeaturesColumnSplit(Learner *learner, size_t rows,
-                                                   std::shared_ptr<DMatrix> m_test,
-                                                   std::shared_ptr<DMatrix> m_invalid) {
-  auto const world_size = collective::GetWorldSize();
-  auto const rank = collective::GetRank();
-  std::shared_ptr<DMatrix> sliced_test{m_test->SliceCol(world_size, rank)};
-  std::shared_ptr<DMatrix> sliced_invalid{m_invalid->SliceCol(world_size, rank)};
-
-  VerifyPredictionWithLesserFeatures(learner, rows, sliced_test, sliced_invalid);
-}
 }  // anonymous namespace

 void TestPredictionWithLesserFeatures(Context const *ctx) {
@@ -202,7 +192,7 @@ void TestPredictionDeviceAccess() {
   HostDeviceVector from_cpu;
   {
-    ASSERT_EQ(from_cpu.DeviceIdx(), Context::kCpuId);
+    ASSERT_TRUE(from_cpu.Device().IsCPU());
     Context cpu_ctx;
     learner->SetParam("device", cpu_ctx.DeviceName());
     learner->Predict(m_test, false, &from_cpu, 0, 0);
@@ -216,7 +206,7 @@ void TestPredictionDeviceAccess() {
     Context cuda_ctx = MakeCUDACtx(0);
     learner->SetParam("device", cuda_ctx.DeviceName());
     learner->Predict(m_test, false, &from_cuda, 0, 0);
-    ASSERT_EQ(from_cuda.DeviceIdx(), 0);
+    ASSERT_EQ(from_cuda.Device(), DeviceOrd::CUDA(0));
     ASSERT_TRUE(from_cuda.DeviceCanWrite());
     ASSERT_FALSE(from_cuda.HostCanRead());
   }
@@ -229,16 +219,24 @@ void TestPredictionDeviceAccess() {
 #endif  // defined(XGBOOST_USE_CUDA)
 }

-void TestPredictionWithLesserFeaturesColumnSplit(Context const *ctx) {
-  size_t constexpr kRows = 256, kTrainCols = 256, kTestCols = 4, kIters = 4;
-  auto m_train = RandomDataGenerator(kRows, kTrainCols, 0.5).GenerateDMatrix(true);
-  auto learner = LearnerForTest(ctx, m_train, kIters);
+void TestPredictionWithLesserFeaturesColumnSplit(bool use_gpu) {
+  auto const world_size = collective::GetWorldSize();
+  auto const rank = collective::GetRank();
+
+  std::size_t constexpr kRows = 256, kTrainCols = 256, kTestCols = 4, kIters = 4;
+  auto m_train = RandomDataGenerator(kRows, kTrainCols, 0.5).Seed(rank).GenerateDMatrix(true);
+  Context ctx;
+  if (use_gpu) {
+    ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : rank);
+  }
+  auto learner = LearnerForTest(&ctx, m_train, kIters);
   auto m_test = RandomDataGenerator(kRows, kTestCols, 0.5).GenerateDMatrix(false);
   auto m_invalid = RandomDataGenerator(kRows, kTrainCols + 1, 0.5).GenerateDMatrix(false);

-  auto constexpr kWorldSize = 2;
-  RunWithInMemoryCommunicator(kWorldSize, VerifyPredictionWithLesserFeaturesColumnSplit,
-                              learner.get(), kRows, m_test, m_invalid);
+  std::shared_ptr<DMatrix> sliced_test{m_test->SliceCol(world_size, rank)};
+  std::shared_ptr<DMatrix> sliced_invalid{m_invalid->SliceCol(world_size, rank)};
+
+  VerifyPredictionWithLesserFeatures(learner.get(), kRows, sliced_test, sliced_invalid);
 }

 void GBTreeModelForTest(gbm::GBTreeModel *model, uint32_t split_ind,
@@ -260,7 +258,11 @@ void GBTreeModelForTest(gbm::GBTreeModel *model, uint32_t split_ind,
   model->CommitModelGroup(std::move(trees), 0);
 }

-void TestCategoricalPrediction(Context const* ctx, bool is_column_split) {
+void TestCategoricalPrediction(bool use_gpu, bool is_column_split) {
+  Context ctx;
+  if (use_gpu) {
+    ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
+  }
   size_t constexpr kCols = 10;
   PredictionCacheEntry out_predictions;
@@ -319,10 +320,10 @@ void TestCategoricalPredictLeaf(Context const *ctx, bool is_column_split) {
   float left_weight = 1.3f;
   float right_weight = 1.7f;

-  gbm::GBTreeModel model(&mparam, ctx);
+  gbm::GBTreeModel model(&mparam, &ctx);
   GBTreeModelForTest(&model, split_ind, split_cat, left_weight, right_weight);

-  std::unique_ptr<Predictor> predictor{CreatePredictorForTest(ctx)};
+  std::unique_ptr<Predictor> predictor{CreatePredictorForTest(&ctx)};

   std::vector row(kCols);
   row[split_ind] = split_cat;
@@ -347,15 +348,10 @@
   ASSERT_EQ(out_predictions.predictions.HostVector()[0], 1);
 }

-void TestCategoricalPredictLeafColumnSplit(Context const *ctx) {
-  auto constexpr kWorldSize = 2;
-  RunWithInMemoryCommunicator(kWorldSize, TestCategoricalPredictLeaf, ctx, true);
-}
-
 void TestIterationRange(Context const* ctx) {
   size_t constexpr kRows = 1000, kCols = 20, kClasses = 4, kForest = 3, kIters = 10;
   auto dmat = RandomDataGenerator(kRows, kCols, 0)
-                  .Device(ctx->gpu_id)
+                  .Device(ctx->Device())
                   .GenerateDMatrix(true, true, kClasses);
   auto learner = LearnerForTest(ctx, dmat, kIters, kForest);
@@ -411,15 +407,30 @@
 }

 namespace {
-void VerifyIterationRangeColumnSplit(DMatrix *dmat, Learner *learner, Learner *sliced,
+void VerifyIterationRangeColumnSplit(bool use_gpu, Json const &ranged_model,
+                                     Json const &sliced_model, std::size_t rows, std::size_t cols,
+                                     std::size_t classes,
                                      std::vector const &expected_margin_ranged,
                                      std::vector const &expected_margin_sliced,
                                      std::vector const &expected_leaf_ranged,
                                      std::vector const &expected_leaf_sliced) {
   auto const world_size = collective::GetWorldSize();
   auto const rank = collective::GetRank();
+  Context ctx;
+  if (use_gpu) {
0 : rank); + } + auto dmat = RandomDataGenerator(rows, cols, 0).GenerateDMatrix(true, true, classes); std::shared_ptr Xy{dmat->SliceCol(world_size, rank)}; + std::unique_ptr learner{Learner::Create({Xy})}; + learner->SetParam("device", ctx.DeviceName()); + learner->LoadModel(ranged_model); + + std::unique_ptr sliced{Learner::Create({Xy})}; + sliced->SetParam("device", ctx.DeviceName()); + sliced->LoadModel(sliced_model); + HostDeviceVector out_predt_sliced; HostDeviceVector out_predt_ranged; @@ -428,11 +439,15 @@ void VerifyIterationRangeColumnSplit(DMatrix *dmat, Learner *learner, Learner *s sliced->Predict(Xy, true, &out_predt_sliced, 0, 0, false, false, false, false, false); learner->Predict(Xy, true, &out_predt_ranged, 0, 3, false, false, false, false, false); auto const &h_sliced = out_predt_sliced.HostVector(); - auto const &h_range = out_predt_ranged.HostVector(); - ASSERT_EQ(h_sliced.size(), expected_margin_sliced.size()); - ASSERT_EQ(h_sliced, expected_margin_sliced); - ASSERT_EQ(h_range.size(), expected_margin_ranged.size()); - ASSERT_EQ(h_range, expected_margin_ranged); + auto const &h_ranged = out_predt_ranged.HostVector(); + EXPECT_EQ(h_sliced.size(), expected_margin_sliced.size()); + for (std::size_t i = 0; i < expected_margin_sliced.size(); ++i) { + ASSERT_FLOAT_EQ(h_sliced[i], expected_margin_sliced[i]) << "rank " << rank << ", i " << i; + } + EXPECT_EQ(h_ranged.size(), expected_margin_ranged.size()); + for (std::size_t i = 0; i < expected_margin_ranged.size(); ++i) { + ASSERT_FLOAT_EQ(h_ranged[i], expected_margin_ranged[i]) << "rank " << rank << ", i " << i; + } } // Leaf @@ -440,21 +455,27 @@ void VerifyIterationRangeColumnSplit(DMatrix *dmat, Learner *learner, Learner *s sliced->Predict(Xy, false, &out_predt_sliced, 0, 0, false, true, false, false, false); learner->Predict(Xy, false, &out_predt_ranged, 0, 3, false, true, false, false, false); auto const &h_sliced = out_predt_sliced.HostVector(); - auto const &h_range = out_predt_ranged.HostVector(); - ASSERT_EQ(h_sliced.size(), expected_leaf_sliced.size()); - ASSERT_EQ(h_sliced, expected_leaf_sliced); - ASSERT_EQ(h_range.size(), expected_leaf_ranged.size()); - ASSERT_EQ(h_range, expected_leaf_ranged); + auto const &h_ranged = out_predt_ranged.HostVector(); + EXPECT_EQ(h_sliced.size(), expected_leaf_sliced.size()); + for (std::size_t i = 0; i < expected_leaf_sliced.size(); ++i) { + ASSERT_FLOAT_EQ(h_sliced[i], expected_leaf_sliced[i]) << "rank " << rank << ", i " << i; + } + EXPECT_EQ(h_ranged.size(), expected_leaf_ranged.size()); + for (std::size_t i = 0; i < expected_leaf_ranged.size(); ++i) { + ASSERT_FLOAT_EQ(h_ranged[i], expected_leaf_ranged[i]) << "rank " << rank << ", i " << i; + } } } } // anonymous namespace -void TestIterationRangeColumnSplit(Context const* ctx) { - size_t constexpr kRows = 1000, kCols = 20, kClasses = 4, kForest = 3, kIters = 10; +void TestIterationRangeColumnSplit(int world_size, bool use_gpu) { + std::size_t constexpr kRows = 1000, kCols = 20, kClasses = 4, kForest = 3, kIters = 10; auto dmat = RandomDataGenerator(kRows, kCols, 0).GenerateDMatrix(true, true, kClasses); - auto learner = LearnerForTest(ctx, dmat, kIters, kForest); - - learner->SetParam("device", ctx->DeviceName()); + Context ctx; + if (use_gpu) { + ctx = MakeCUDACtx(0); + } + auto learner = LearnerForTest(&ctx, dmat, kIters, kForest); bool bound = false; std::unique_ptr sliced{learner->Slice(0, 3, 1, &bound)}; @@ -476,9 +497,13 @@ void TestIterationRangeColumnSplit(Context const* ctx) { auto const &leaf_sliced = 
   auto const &leaf_sliced = leaf_predt_sliced.HostVector();
   auto const &leaf_ranged = leaf_predt_ranged.HostVector();

-  auto constexpr kWorldSize = 2;
-  RunWithInMemoryCommunicator(kWorldSize, VerifyIterationRangeColumnSplit, dmat.get(),
-                              learner.get(), sliced.get(), margin_ranged, margin_sliced,
+  Json ranged_model{Object{}};
+  learner->SaveModel(&ranged_model);
+  Json sliced_model{Object{}};
+  sliced->SaveModel(&sliced_model);
+
+  RunWithInMemoryCommunicator(world_size, VerifyIterationRangeColumnSplit, use_gpu, ranged_model,
+                              sliced_model, kRows, kCols, kClasses, margin_ranged, margin_sliced,
                               leaf_ranged, leaf_sliced);
 }

@@ -497,7 +522,7 @@ void TestSparsePrediction(Context const *ctx, float sparsity) {

   if (ctx->IsCUDA()) {
     learner->SetParam("tree_method", "gpu_hist");
-    learner->SetParam("gpu_id", std::to_string(ctx->gpu_id));
+    learner->SetParam("device", ctx->Device().Name());
   }
   learner->Predict(Xy, false, &sparse_predt, 0, 0);
@@ -539,11 +564,20 @@ void TestSparsePrediction(Context const *ctx, float sparsity) {
 }

 namespace {
-void VerifySparsePredictionColumnSplit(DMatrix *dmat, Learner *learner,
+void VerifySparsePredictionColumnSplit(bool use_gpu, Json const &model, std::size_t rows,
+                                       std::size_t cols, float sparsity,
                                        std::vector const &expected_predt) {
-  std::shared_ptr<DMatrix> sliced{
-      dmat->SliceCol(collective::GetWorldSize(), collective::GetRank())};
+  Context ctx;
+  if (use_gpu) {
+    ctx = MakeCUDACtx(common::AllVisibleGPUs() == 1 ? 0 : collective::GetRank());
+  }
+  auto Xy = RandomDataGenerator(rows, cols, sparsity).GenerateDMatrix(true);
+  std::shared_ptr<DMatrix> sliced{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())};
   HostDeviceVector sparse_predt;
+
+  std::unique_ptr<Learner> learner{Learner::Create({sliced})};
+  learner->SetParam("device", ctx.DeviceName());
+  learner->LoadModel(model);
   learner->Predict(sliced, false, &sparse_predt, 0, 0);

   auto const &predt = sparse_predt.HostVector();
@@ -554,10 +588,14 @@
 }
 }  // anonymous namespace

-void TestSparsePredictionColumnSplit(Context const* ctx, float sparsity) {
+void TestSparsePredictionColumnSplit(int world_size, bool use_gpu, float sparsity) {
+  Context ctx;
+  if (use_gpu) {
+    ctx = MakeCUDACtx(0);
+  }
   size_t constexpr kRows = 512, kCols = 128, kIters = 4;
   auto Xy = RandomDataGenerator(kRows, kCols, sparsity).GenerateDMatrix(true);
-  auto learner = LearnerForTest(ctx, Xy, kIters);
+  auto learner = LearnerForTest(&ctx, Xy, kIters);

   HostDeviceVector sparse_predt;

@@ -567,12 +605,11 @@
   learner.reset(Learner::Create({Xy}));
   learner->LoadModel(model);

-  learner->SetParam("device", ctx->DeviceName());
+  learner->SetParam("device", ctx.DeviceName());
   learner->Predict(Xy, false, &sparse_predt, 0, 0);

-  auto constexpr kWorldSize = 2;
-  RunWithInMemoryCommunicator(kWorldSize, VerifySparsePredictionColumnSplit, Xy.get(),
-                              learner.get(), sparse_predt.HostVector());
+  RunWithInMemoryCommunicator(world_size, VerifySparsePredictionColumnSplit, use_gpu, model,
+                              kRows, kCols, sparsity, sparse_predt.HostVector());
 }

 void TestVectorLeafPrediction(Context const *ctx) {
@@ -583,7 +620,7 @@ void TestVectorLeafPrediction(Context const *ctx) {
   size_t constexpr kCols = 5;

   LearnerModelParam mparam{static_cast(kCols),
-                           linalg::Vector{{0.5}, {1}, Context::kCpuId}, 1, 3,
+                           linalg::Vector{{0.5}, {1}, DeviceOrd::CPU()}, 1, 3,
                            MultiStrategy::kMultiOutputTree};

   std::vector<std::unique_ptr<RegTree>> trees;
diff --git a/tests/cpp/predictor/test_predictor.h b/tests/cpp/predictor/test_predictor.h
index 81ec3cb5d..9e0891d56 100644
--- a/tests/cpp/predictor/test_predictor.h
+++ b/tests/cpp/predictor/test_predictor.h
@@ -94,23 +94,19 @@ void TestPredictionWithLesserFeatures(Context const* ctx);

 void TestPredictionDeviceAccess();

-void TestCategoricalPrediction(Context const* ctx, bool is_column_split);
+void TestCategoricalPrediction(bool use_gpu, bool is_column_split);

-void TestCategoricalPredictionColumnSplit(Context const* ctx);
+void TestPredictionWithLesserFeaturesColumnSplit(bool use_gpu);

-void TestPredictionWithLesserFeaturesColumnSplit(Context const* ctx);
-
-void TestCategoricalPredictLeaf(Context const* ctx, bool is_column_split);
-
-void TestCategoricalPredictLeafColumnSplit(Context const* ctx);
+void TestCategoricalPredictLeaf(bool use_gpu, bool is_column_split);

 void TestIterationRange(Context const* ctx);

-void TestIterationRangeColumnSplit(Context const* ctx);
+void TestIterationRangeColumnSplit(int world_size, bool use_gpu);

 void TestSparsePrediction(Context const* ctx, float sparsity);

-void TestSparsePredictionColumnSplit(Context const* ctx, float sparsity);
+void TestSparsePredictionColumnSplit(int world_size, bool use_gpu, float sparsity);

 void TestVectorLeafPrediction(Context const* ctx);
 }  // namespace xgboost
diff --git a/tests/cpp/test_context.cc b/tests/cpp/test_context.cc
index d49f7b4b2..2fdf04aa1 100644
--- a/tests/cpp/test_context.cc
+++ b/tests/cpp/test_context.cc
@@ -5,11 +5,13 @@
 #include
 #include

+#include
+
 namespace xgboost {
 TEST(Context, CPU) {
   Context ctx;
   ASSERT_EQ(ctx.Device(), DeviceOrd::CPU());
-  ASSERT_EQ(ctx.Ordinal(), Context::kCpuId);
+  ASSERT_EQ(ctx.Ordinal(), DeviceOrd::CPUOrdinal());

   std::int32_t flag{0};
   ctx.DispatchDevice([&] { flag = -1; }, [&] { flag = 1; });
@@ -27,5 +29,20 @@ TEST(Context, CPU) {
   ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ":gpu"}}), dmlc::Error);
   ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ":0"}}), dmlc::Error);
   ASSERT_THROW(ctx.UpdateAllowUnknown(Args{{"device", ""}}), dmlc::Error);
+
+  std::stringstream ss;
+  ss << ctx.Device();
+  ASSERT_EQ(ss.str(), "cpu");
+}
+
+TEST(Context, ErrorInit) {
+  Context ctx;
+  ASSERT_THROW({ ctx.Init({{"foo", "bar"}}); }, dmlc::Error);
+  try {
+    ctx.Init({{"foo", "bar"}});
+  } catch (dmlc::Error const& e) {
+    auto msg = std::string{e.what()};
+    ASSERT_NE(msg.find("foo"), std::string::npos);
+  }
 }
 }  // namespace xgboost
diff --git a/tests/cpp/test_context.cu b/tests/cpp/test_context.cu
index 035d22125..7684ff467 100644
--- a/tests/cpp/test_context.cu
+++ b/tests/cpp/test_context.cu
@@ -13,7 +13,6 @@ namespace xgboost {
 namespace {
 void TestCUDA(Context const& ctx, bst_d_ordinal_t ord) {
-  ASSERT_EQ(ctx.gpu_id, ord);
   ASSERT_EQ(ctx.Device().ordinal, ord);
   ASSERT_EQ(ctx.DeviceName(), "cuda:" + std::to_string(ord));
   ASSERT_EQ(ctx.Ordinal(), ord);
@@ -25,7 +24,7 @@ void TestCUDA(Context const& ctx, bst_d_ordinal_t ord) {
   Context new_ctx;
   FromJson(jctx, &new_ctx);
   ASSERT_EQ(new_ctx.Device(), ctx.Device());
-  ASSERT_EQ(new_ctx.gpu_id, ctx.gpu_id);
+  ASSERT_EQ(new_ctx.Ordinal(), ctx.Ordinal());
 }
 }  // namespace

@@ -53,7 +52,7 @@ TEST(Context, DeviceOrdinal) {

   auto cpu_ctx = ctx.MakeCPU();
   ASSERT_TRUE(cpu_ctx.IsCPU());
-  ASSERT_EQ(cpu_ctx.Ordinal(), Context::kCpuId);
+  ASSERT_EQ(cpu_ctx.Ordinal(), DeviceOrd::CPUOrdinal());
   ASSERT_EQ(cpu_ctx.Device(), DeviceOrd::CPU());

   auto cuda_ctx = cpu_ctx.MakeCUDA(ctx.Ordinal());
diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc
index 82a0f7e31..7c4f10b6d 100644
--- a/tests/cpp/test_learner.cc
+++ b/tests/cpp/test_learner.cc
@@ -655,33 +655,11 @@ TEST_F(InitBaseScore, InitWithPredict) { this->TestInitWithPredt(); }
 TEST_F(InitBaseScore, UpdateProcess) { this->TestUpdateProcess(); }

 class TestColumnSplit : public ::testing::TestWithParam<std::string> {
-  static auto MakeFmat(std::string const& obj) {
-    auto constexpr kRows = 10, kCols = 10;
-    auto p_fmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
-    auto& h_upper = p_fmat->Info().labels_upper_bound_.HostVector();
-    auto& h_lower = p_fmat->Info().labels_lower_bound_.HostVector();
-    h_lower.resize(kRows);
-    h_upper.resize(kRows);
-    for (size_t i = 0; i < kRows; ++i) {
-      h_lower[i] = 1;
-      h_upper[i] = 10;
-    }
-    if (obj.find("rank:") != std::string::npos) {
-      auto h_label = p_fmat->Info().labels.HostView();
-      std::size_t k = 0;
-      for (auto& v : h_label) {
-        v = k % 2 == 0;
-        ++k;
-      }
-    }
-    return p_fmat;
-  };
-
   void TestBaseScore(std::string objective, float expected_base_score, Json expected_model) {
     auto const world_size = collective::GetWorldSize();
     auto const rank = collective::GetRank();

-    auto p_fmat = MakeFmat(objective);
+    auto p_fmat = MakeFmatForObjTest(objective);
     std::shared_ptr<DMatrix> sliced{p_fmat->SliceCol(world_size, rank)};
     std::unique_ptr<Learner> learner{Learner::Create({sliced})};
     learner->SetParam("tree_method", "approx");
@@ -705,7 +683,7 @@ class TestColumnSplit : public ::testing::TestWithParam<std::string> {

  public:
   void Run(std::string objective) {
-    auto p_fmat = MakeFmat(objective);
+    auto p_fmat = MakeFmatForObjTest(objective);
     std::unique_ptr<Learner> learner{Learner::Create({p_fmat})};
     learner->SetParam("tree_method", "approx");
     learner->SetParam("objective", objective);
@@ -740,4 +718,112 @@ INSTANTIATE_TEST_SUITE_P(ColumnSplitObjective, TestColumnSplit,
                          [](const ::testing::TestParamInfo& info) {
                            return ObjTestNameGenerator(info);
                          });
+
+namespace {
+Json GetModelWithArgs(std::shared_ptr<DMatrix> dmat, std::string const& tree_method,
+                      std::string const& device, Args const& args) {
+  std::unique_ptr<Learner> learner{Learner::Create({dmat})};
+  learner->SetParam("tree_method", tree_method);
+  learner->SetParam("device", device);
+  learner->SetParam("objective", "reg:logistic");
+  learner->SetParams(args);
+  learner->UpdateOneIter(0, dmat);
+  Json model{Object{}};
+  learner->SaveModel(&model);
+  return model;
+}
+
+void VerifyColumnSplitWithArgs(std::string const& tree_method, bool use_gpu, Args const& args,
+                               Json const& expected_model) {
+  auto const world_size = collective::GetWorldSize();
+  auto const rank = collective::GetRank();
+  auto p_fmat = MakeFmatForObjTest("");
+  std::shared_ptr<DMatrix> sliced{p_fmat->SliceCol(world_size, rank)};
+  std::string device = "cpu";
+  if (use_gpu) {
+    auto gpu_id = common::AllVisibleGPUs() == 1 ? 0 : rank;
+    device = "cuda:" + std::to_string(gpu_id);
+  }
+  auto model = GetModelWithArgs(sliced, tree_method, device, args);
+  ASSERT_EQ(model, expected_model);
+}
+
+void TestColumnSplitWithArgs(std::string const& tree_method, bool use_gpu, Args const& args) {
+  auto p_fmat = MakeFmatForObjTest("");
+  std::string device = use_gpu ? "cuda:0" : "cpu";
+  auto model = GetModelWithArgs(p_fmat, tree_method, device, args);
+
+  auto world_size{3};
+  if (use_gpu) {
+    world_size = common::AllVisibleGPUs();
+    // Simulate MPU on a single GPU.
+    if (world_size == 1) {
+      world_size = 3;
+    }
+  }
+  RunWithInMemoryCommunicator(world_size, VerifyColumnSplitWithArgs, tree_method, use_gpu, args,
+                              model);
+}
+
+void TestColumnSplitColumnSampler(std::string const& tree_method, bool use_gpu) {
+  Args args{{"colsample_bytree", "0.5"}, {"colsample_bylevel", "0.6"}, {"colsample_bynode", "0.7"}};
+  TestColumnSplitWithArgs(tree_method, use_gpu, args);
+}
+
+void TestColumnSplitInteractionConstraints(std::string const& tree_method, bool use_gpu) {
+  Args args{{"interaction_constraints", "[[0, 5, 7], [2, 8, 9], [1, 3, 6]]"}};
+  TestColumnSplitWithArgs(tree_method, use_gpu, args);
+}
+
+void TestColumnSplitMonotoneConstraints(std::string const& tree_method, bool use_gpu) {
+  Args args{{"monotone_constraints", "(1,-1,0,1,1,-1,-1,0,0,1)"}};
+  TestColumnSplitWithArgs(tree_method, use_gpu, args);
+}
+}  // anonymous namespace
+
+TEST(ColumnSplitColumnSampler, Approx) { TestColumnSplitColumnSampler("approx", false); }
+
+TEST(ColumnSplitColumnSampler, Hist) { TestColumnSplitColumnSampler("hist", false); }
+
+#if defined(XGBOOST_USE_CUDA)
+TEST(MGPUColumnSplitColumnSampler, GPUApprox) { TestColumnSplitColumnSampler("approx", true); }
+
+TEST(MGPUColumnSplitColumnSampler, GPUHist) { TestColumnSplitColumnSampler("hist", true); }
+#endif  // defined(XGBOOST_USE_CUDA)
+
+TEST(ColumnSplitInteractionConstraints, Approx) {
+  TestColumnSplitInteractionConstraints("approx", false);
+}
+
+TEST(ColumnSplitInteractionConstraints, Hist) {
+  TestColumnSplitInteractionConstraints("hist", false);
+}
+
+#if defined(XGBOOST_USE_CUDA)
+TEST(MGPUColumnSplitInteractionConstraints, GPUApprox) {
+  TestColumnSplitInteractionConstraints("approx", true);
+}
+
+TEST(MGPUColumnSplitInteractionConstraints, GPUHist) {
+  TestColumnSplitInteractionConstraints("hist", true);
+}
+#endif  // defined(XGBOOST_USE_CUDA)
+
+TEST(ColumnSplitMonotoneConstraints, Approx) {
+  TestColumnSplitMonotoneConstraints("approx", false);
+}
+
+TEST(ColumnSplitMonotoneConstraints, Hist) {
+  TestColumnSplitMonotoneConstraints("hist", false);
+}
+
+#if defined(XGBOOST_USE_CUDA)
+TEST(MGPUColumnSplitMonotoneConstraints, GPUApprox) {
+  TestColumnSplitMonotoneConstraints("approx", true);
+}
+
+TEST(MGPUColumnSplitMonotoneConstraints, GPUHist) {
+  TestColumnSplitMonotoneConstraints("hist", true);
+}
+#endif  // defined(XGBOOST_USE_CUDA)
 }  // namespace xgboost
diff --git a/tests/cpp/test_serialization.cc b/tests/cpp/test_serialization.cc
index 28473bc9f..bf23991c1 100644
--- a/tests/cpp/test_serialization.cc
+++ b/tests/cpp/test_serialization.cc
@@ -210,9 +210,9 @@ void TestLearnerSerialization(Args args, FeatureMap const& fmap, std::shared_ptr
   }
   // Pull data to device
   for (auto &batch : p_dmat->GetBatches<SparsePage>()) {
-    batch.data.SetDevice(0);
+    batch.data.SetDevice(DeviceOrd::CUDA(0));
     batch.data.DeviceSpan();
-    batch.offset.SetDevice(0);
+    batch.offset.SetDevice(DeviceOrd::CUDA(0));
     batch.offset.DeviceSpan();
   }
diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
index e627402fc..ed5584b3e 100644
--- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
+++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
@@ -1,5 +1,5 @@
-/*!
- * Copyright 2020-2022 by XGBoost contributors
+/**
+ * Copyright 2020-2023, XGBoost contributors
  */
 #include
 #include
@@ -13,9 +13,7 @@
 #include "../../histogram_helpers.h"
 #include "../test_evaluate_splits.h"  // TestPartitionBasedSplit

-namespace xgboost {
-namespace tree {
-
+namespace xgboost::tree {
 namespace {
 auto ZeroParam() {
   auto args = Args{{"min_child_weight", "0"}, {"lambda", "0"}};
@@ -41,11 +39,12 @@ thrust::device_vector ConvertToInteger(std::vector
   feature_set = std::vector{0};
   GPUTrainingParam param{param_};
-  cuts_.cut_ptrs_.SetDevice(0);
-  cuts_.cut_values_.SetDevice(0);
-  cuts_.min_vals_.SetDevice(0);
+  cuts_.cut_ptrs_.SetDevice(ctx.Device());
+  cuts_.cut_values_.SetDevice(ctx.Device());
+  cuts_.min_vals_.SetDevice(ctx.Device());
   thrust::device_vector feature_histogram{ConvertToInteger(feature_histogram_)};

   dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical);
@@ -61,9 +60,10 @@ TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) {
                                  cuts_.min_vals_.ConstDeviceSpan(),
                                  false};

-  GPUHistEvaluator evaluator{param_, static_cast(feature_set.size()), 0};
+  GPUHistEvaluator evaluator{param_, static_cast(feature_set.size()), ctx.Device()};

-  evaluator.Reset(cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, false, 0);
+  evaluator.Reset(cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, false,
+                  ctx.Device());
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;

   ASSERT_EQ(result.thresh, 1);
@@ -73,6 +73,7 @@
 }

 TEST(GpuHist, PartitionBasic) {
+  auto ctx = MakeCUDACtx(0);
   TrainParam tparam = ZeroParam();
   tparam.max_cat_to_onehot = 0;
   GPUTrainingParam param{tparam};
@@ -81,9 +82,9 @@ TEST(GpuHist, PartitionBasic) {
   cuts.cut_values_.HostVector() = std::vector{0.0, 1.0, 2.0};
   cuts.cut_ptrs_.HostVector() = std::vector{0, 3};
   cuts.min_vals_.HostVector() = std::vector{0.0};
-  cuts.cut_ptrs_.SetDevice(0);
-  cuts.cut_values_.SetDevice(0);
-  cuts.min_vals_.SetDevice(0);
+  cuts.cut_ptrs_.SetDevice(ctx.Device());
+  cuts.cut_values_.SetDevice(ctx.Device());
+  cuts.min_vals_.SetDevice(ctx.Device());
   thrust::device_vector feature_set = std::vector{0};
   thrust::device_vector monotonic_constraints(feature_set.size(), 0);
@@ -104,8 +105,8 @@ TEST(GpuHist, PartitionBasic) {
     false,
   };

-  GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0);
+  GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), ctx.Device()};
+  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, ctx.Device());

   {
     // -1.0s go right
@@ -187,6 +188,7 @@ TEST(GpuHist, PartitionBasic) {
 }

 TEST(GpuHist, PartitionTwoFeatures) {
+  auto ctx = MakeCUDACtx(0);
   TrainParam tparam = ZeroParam();
   tparam.max_cat_to_onehot = 0;
   GPUTrainingParam param{tparam};
@@ -195,9 +197,9 @@ TEST(GpuHist, PartitionTwoFeatures) {
   cuts.cut_values_.HostVector() = std::vector{0.0, 1.0, 2.0, 0.0, 1.0, 2.0};
   cuts.cut_ptrs_.HostVector() = std::vector{0, 3, 6};
   cuts.min_vals_.HostVector() = std::vector{0.0, 0.0};
-  cuts.cut_ptrs_.SetDevice(0);
-  cuts.cut_values_.SetDevice(0);
-  cuts.min_vals_.SetDevice(0);
+  cuts.cut_ptrs_.SetDevice(ctx.Device());
+  cuts.cut_values_.SetDevice(ctx.Device());
+  cuts.min_vals_.SetDevice(ctx.Device());
   thrust::device_vector feature_set = std::vector{0, 1};
   thrust::device_vector monotonic_constraints(feature_set.size(), 0);
@@ -216,8 +218,8 @@ TEST(GpuHist, PartitionTwoFeatures) {
   cuts.min_vals_.ConstDeviceSpan(),
   false};

-  GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0);
+  GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), ctx.Device()};
+  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, ctx.Device());

   {
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0});
@@ -247,6 +249,7 @@ TEST(GpuHist, PartitionTwoFeatures) {
 }

 TEST(GpuHist, PartitionTwoNodes) {
+  auto ctx = MakeCUDACtx(0);
   TrainParam tparam = ZeroParam();
   tparam.max_cat_to_onehot = 0;
   GPUTrainingParam param{tparam};
@@ -255,9 +258,9 @@ TEST(GpuHist, PartitionTwoNodes) {
   cuts.cut_values_.HostVector() = std::vector{0.0, 1.0, 2.0};
   cuts.cut_ptrs_.HostVector() = std::vector{0, 3};
   cuts.min_vals_.HostVector() = std::vector{0.0};
-  cuts.cut_ptrs_.SetDevice(0);
-  cuts.cut_values_.SetDevice(0);
-  cuts.min_vals_.SetDevice(0);
+  cuts.cut_ptrs_.SetDevice(ctx.Device());
+  cuts.cut_values_.SetDevice(ctx.Device());
+  cuts.min_vals_.SetDevice(ctx.Device());
   thrust::device_vector feature_set = std::vector{0};
   thrust::device_vector monotonic_constraints(feature_set.size(), 0);
@@ -276,8 +279,10 @@ TEST(GpuHist, PartitionTwoNodes) {
   cuts.min_vals_.ConstDeviceSpan(),
   false};

-  GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0);
+  GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()),
+                             ctx.Device()};
+  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false,
+                  ctx.Device());

   {
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0});
@@ -299,12 +304,14 @@ TEST(GpuHist, PartitionTwoNodes) {
 }

 void TestEvaluateSingleSplit(bool is_categorical) {
+  auto ctx = MakeCUDACtx(0);
   auto quantiser = DummyRoundingFactor();
   auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0});
   TrainParam tparam = ZeroParam();
   GPUTrainingParam param{tparam};

-  common::HistogramCuts cuts{MakeCutsForTest({1.0, 2.0, 11.0, 12.0}, {0, 2, 4}, {0.0, 0.0}, 0)};
+  common::HistogramCuts cuts{
+      MakeCutsForTest({1.0, 2.0, 11.0, 12.0}, {0, 2, 4}, {0.0, 0.0}, ctx.Device())};
   thrust::device_vector feature_set = std::vector{0, 1};

   // Setup gradients so that second feature gets higher gain
@@ -329,8 +336,10 @@ void TestEvaluateSingleSplit(bool is_categorical) {
                                  cuts.min_vals_.ConstDeviceSpan(),
                                  false};

-  GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, 0);
+  GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()),
+                             ctx.Device()};
+  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false,
+                  ctx.Device());

   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
   EXPECT_EQ(result.findex, 1);
@@ -367,7 +376,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) {
                                  dh::ToSpan(feature_min_values),
                                  false};

-  GPUHistEvaluator evaluator(tparam, feature_set.size(), 0);
+  GPUHistEvaluator evaluator(tparam, feature_set.size(), FstCU());
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;

   EXPECT_EQ(result.findex, 0);
@@ -379,7 +388,7 @@ TEST(GpuHist, EvaluateSingleSplitEmpty) {
   TrainParam tparam = ZeroParam();
-  GPUHistEvaluator evaluator(tparam, 1, 0);
+  GPUHistEvaluator evaluator(tparam, 1, FstCU());
   DeviceSplitCandidate result =
       evaluator
           .EvaluateSingleSplit(
@@ -414,7 +423,7 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {
                                  dh::ToSpan(feature_min_values),
                                  false};

-  GPUHistEvaluator evaluator(tparam, feature_min_values.size(), 0);
+  GPUHistEvaluator evaluator(tparam, feature_min_values.size(), FstCU());
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;

   EXPECT_EQ(result.findex, 1);
@@ -446,7 +455,7 @@ TEST(GpuHist, EvaluateSingleSplitBreakTies) {
                                  dh::ToSpan(feature_min_values),
                                  false};

-  GPUHistEvaluator evaluator(tparam, feature_min_values.size(), 0);
+  GPUHistEvaluator evaluator(tparam, feature_min_values.size(), FstCU());
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;

   EXPECT_EQ(result.findex, 0);
@@ -481,7 +490,8 @@ TEST(GpuHist, EvaluateSplits) {
                                  dh::ToSpan(feature_min_values),
                                  false};

-  GPUHistEvaluator evaluator{tparam, static_cast(feature_min_values.size()), 0};
+  GPUHistEvaluator evaluator{tparam, static_cast(feature_min_values.size()),
+                             FstCU()};
   dh::device_vector inputs = std::vector{input_left, input_right};
   evaluator.LaunchEvaluateSplits(input_left.feature_set.size(), dh::ToSpan(inputs), shared_inputs,
@@ -497,14 +507,15 @@ TEST(GpuHist, EvaluateSplits) {
 }

 TEST_F(TestPartitionBasedSplit, GpuHist) {
+  auto ctx = MakeCUDACtx(0);
   dh::device_vector ft{std::vector{FeatureType::kCategorical}};
-  GPUHistEvaluator evaluator{param_, static_cast(info_.num_col_), 0};
+  GPUHistEvaluator evaluator{param_, static_cast(info_.num_col_), ctx.Device()};

-  cuts_.cut_ptrs_.SetDevice(0);
-  cuts_.cut_values_.SetDevice(0);
-  cuts_.min_vals_.SetDevice(0);
+  cuts_.cut_ptrs_.SetDevice(ctx.Device());
+  cuts_.cut_values_.SetDevice(ctx.Device());
+  cuts_.min_vals_.SetDevice(ctx.Device());

-  evaluator.Reset(cuts_, dh::ToSpan(ft), info_.num_col_, param_, false, 0);
+  evaluator.Reset(cuts_, dh::ToSpan(ft), info_.num_col_, param_, false, ctx.Device());

   // Convert the sample histogram to fixed point
   auto quantiser = DummyRoundingFactor();
@@ -532,15 +543,16 @@ class MGPUHistTest : public BaseMGPUTest {};

 namespace {
 void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
+  auto ctx = MakeCUDACtx(GPUIDX);
   auto rank = collective::GetRank();
   auto quantiser = DummyRoundingFactor();
   auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{0.0, 1.0});
   TrainParam tparam = ZeroParam();
   GPUTrainingParam param{tparam};

-  common::HistogramCuts cuts{rank == 0
-                                 ? MakeCutsForTest({1.0, 2.0}, {0, 2, 2}, {0.0, 0.0}, GPUIDX)
-                                 : MakeCutsForTest({11.0, 12.0}, {0, 0, 2}, {0.0, 0.0}, GPUIDX)};
+  common::HistogramCuts cuts{
+      rank == 0 ? MakeCutsForTest({1.0, 2.0}, {0, 2, 2}, {0.0, 0.0}, ctx.Device())
+                : MakeCutsForTest({11.0, 12.0}, {0, 0, 2}, {0.0, 0.0}, ctx.Device())};
   thrust::device_vector feature_set = std::vector{0, 1};

   // Setup gradients so that second feature gets higher gain
@@ -566,8 +578,8 @@ void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
                                  cuts.min_vals_.ConstDeviceSpan(),
                                  false};

-  GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), GPUIDX};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true, GPUIDX);
+  GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), ctx.Device()};
+  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true, ctx.Device());
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;

   EXPECT_EQ(result.findex, 1) << "rank: " << rank;
@@ -587,5 +599,4 @@ TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleSplit) {
 TEST_F(MGPUHistTest, ColumnSplitEvaluateSingleCategoricalSplit) {
   DoTest(VerifyColumnSplitEvaluateSingleSplit, true);
 }
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
diff --git a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
index 0f31339a3..a0f9200ff 100644
--- a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
+++ b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu
@@ -34,9 +34,9 @@ void VerifySampling(size_t page_size,
   for (const auto& gp : gpair.ConstHostVector()) {
     sum_gpair += gp;
   }
-  gpair.SetDevice(0);
-
   Context ctx{MakeCUDACtx(0)};
+  gpair.SetDevice(ctx.Device());
+
   auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
   auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
   if (page_size != 0) {
@@ -91,9 +91,9 @@ TEST(GradientBasedSampler, NoSamplingExternalMemory) {
   std::unique_ptr<DMatrix> dmat(
       CreateSparsePageDMatrix(kRows, kCols, kRows / kPageSize, tmpdir.path + "/cache"));
   auto gpair = GenerateRandomGradients(kRows);
-  gpair.SetDevice(0);
-
   Context ctx{MakeCUDACtx(0)};
+  gpair.SetDevice(ctx.Device());
+
   auto param = BatchParam{256, tree::TrainParam::DftSparseThreshold()};
   auto page = (*dmat->GetBatches<EllpackPage>(&ctx, param).begin()).Impl();
   EXPECT_NE(page->n_rows, kRows);
diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu
index 07779cb3d..3e6d24a93 100644
--- a/tests/cpp/tree/gpu_hist/test_histogram.cu
+++ b/tests/cpp/tree/gpu_hist/test_histogram.cu
@@ -17,9 +17,7 @@
 #include "../../categorical_helpers.h"
 #include "../../helpers.h"

-namespace xgboost {
-namespace tree {
-
+namespace xgboost::tree {
 void TestDeterministicHistogram(bool is_dense, int shm_size) {
   Context ctx = MakeCUDACtx(0);
   size_t constexpr kBins = 256, kCols = 120, kRows = 16384, kRounds = 16;
@@ -32,22 +30,22 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) {

   for (auto const& batch : matrix->GetBatches<EllpackPage>(&ctx, batch_param)) {
     auto* page = batch.Impl();

-    tree::RowPartitioner row_partitioner(0, kRows);
+    tree::RowPartitioner row_partitioner(FstCU(), kRows);
     auto ridx = row_partitioner.GetRows(0);

     int num_bins = kBins * kCols;
     dh::device_vector histogram(num_bins);
     auto d_histogram = dh::ToSpan(histogram);
     auto gpair = GenerateRandomGradients(kRows, kLower, kUpper);
-    gpair.SetDevice(0);
+    gpair.SetDevice(FstCU());

     FeatureGroups feature_groups(page->Cuts(), page->is_dense, shm_size, sizeof(GradientPairInt64));

     auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo());
-    BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0),
-                           feature_groups.DeviceAccessor(0), gpair.DeviceSpan(), ridx, d_histogram,
-                           quantiser);
+    BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(FstCU()),
+                           feature_groups.DeviceAccessor(FstCU()), gpair.DeviceSpan(), ridx,
+                           d_histogram, quantiser);

     std::vector histogram_h(num_bins);
 #if defined(XGBOOST_USE_CUDA)
@@ -65,8 +63,8 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) {
       auto d_new_histogram = dh::ToSpan(new_histogram);

       auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo());
-      BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0),
-                             feature_groups.DeviceAccessor(0), gpair.DeviceSpan(), ridx,
+      BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(FstCU()),
+                             feature_groups.DeviceAccessor(FstCU()), gpair.DeviceSpan(), ridx,
                              d_new_histogram, quantiser);

       std::vector new_histogram_h(num_bins);
@@ -87,14 +85,14 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) {

     {
       auto gpair = GenerateRandomGradients(kRows, kLower, kUpper);
-      gpair.SetDevice(0);
+      gpair.SetDevice(FstCU());

       // Use a single feature group to compute the baseline.
       FeatureGroups single_group(page->Cuts());

       dh::device_vector baseline(num_bins);
-      BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0),
-                             single_group.DeviceAccessor(0), gpair.DeviceSpan(), ridx,
+      BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(FstCU()),
+                             single_group.DeviceAccessor(FstCU()), gpair.DeviceSpan(), ridx,
                              dh::ToSpan(baseline), quantiser);

       std::vector baseline_h(num_bins);
@@ -149,11 +147,11 @@ void TestGPUHistogramCategorical(size_t num_categories) {
   auto cat_m = GetDMatrixFromData(x, kRows, 1);
   cat_m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);
   auto batch_param = BatchParam{kBins, tree::TrainParam::DftSparseThreshold()};
-  tree::RowPartitioner row_partitioner(0, kRows);
+  tree::RowPartitioner row_partitioner(ctx.Device(), kRows);
   auto ridx = row_partitioner.GetRows(0);
   dh::device_vector cat_hist(num_categories);
   auto gpair = GenerateRandomGradients(kRows, 0, 2);
-  gpair.SetDevice(0);
+  gpair.SetDevice(DeviceOrd::CUDA(0));
   auto quantiser = GradientQuantiser(gpair.DeviceSpan(), MetaInfo());
   /**
    * Generate hist with cat data.
@@ -161,8 +159,8 @@ void TestGPUHistogramCategorical(size_t num_categories) {
   for (auto const &batch : cat_m->GetBatches<EllpackPage>(&ctx, batch_param)) {
     auto* page = batch.Impl();
     FeatureGroups single_group(page->Cuts());
-    BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0),
-                           single_group.DeviceAccessor(0), gpair.DeviceSpan(), ridx,
+    BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()),
+                           single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx,
                            dh::ToSpan(cat_hist), quantiser);
   }
@@ -175,8 +173,8 @@ void TestGPUHistogramCategorical(size_t num_categories) {
   for (auto const &batch : encode_m->GetBatches<EllpackPage>(&ctx, batch_param)) {
     auto* page = batch.Impl();
     FeatureGroups single_group(page->Cuts());
-    BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0),
-                           single_group.DeviceAccessor(0), gpair.DeviceSpan(), ridx,
+    BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()),
+                           single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx,
                            dh::ToSpan(encode_hist), quantiser);
   }
@@ -264,5 +262,4 @@ void TestAtomicAdd() {
 TEST(Histogram, AtomicAddInt64) { TestAtomicAdd(); }
-}  // namespace tree
-}  // namespace xgboost
+}  // namespace xgboost::tree
diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu
index a46b6b3e2..d8b085856 100644
--- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu
+++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu
@@ -20,12 +20,10 @@
 #include "xgboost/task.h"
 #include "xgboost/tree_model.h"

-namespace xgboost {
-namespace tree {
-
+namespace xgboost::tree {
 void TestUpdatePositionBatch() {
   const int kNumRows = 10;
-  RowPartitioner rp(0, kNumRows);
+  RowPartitioner rp(FstCU(), kNumRows);
   auto rows = rp.GetRowsHost(0);
   EXPECT_EQ(rows.size(), kNumRows);
   for (auto i = 0ull; i < kNumRows; i++) {
@@ -100,12 +98,11 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vectorInfo(), &param, sampler};
   HistMakerTrainParam hist_param;
   std::vector histogram(n_targets);
-  linalg::Vector root_sum({2}, Context::kCpuId);
+  linalg::Vector root_sum({2}, DeviceOrd::CPU());
   for (bst_target_t t{0}; t < n_targets; ++t) {
     auto &hist = histogram[t];
     hist.Reset(n_bins * n_features, hist_param.max_cached_hist_node);
diff --git a/tests/cpp/tree/test_evaluate_splits.h b/tests/cpp/tree/test_evaluate_splits.h
index 6cb75e23b..6506b54e8 100644
--- a/tests/cpp/tree/test_evaluate_splits.h
+++ b/tests/cpp/tree/test_evaluate_splits.h
@@ -76,7 +76,7 @@ class TestPartitionBasedSplit : public ::testing::Test {
                    GradientPairPrecise parent_sum) {
     int32_t best_thresh = -1;
     float best_score{-std::numeric_limits<float>::infinity()};
-    TreeEvaluator evaluator{param_, static_cast<bst_feature_t>(n_feat), -1};
+    TreeEvaluator evaluator{param_, static_cast<bst_feature_t>(n_feat), DeviceOrd::CPU()};
     auto tree_evaluator = evaluator.GetEvaluator();
     GradientPairPrecise left_sum;
     auto parent_gain = tree_evaluator.CalcGain(0, param_, GradStats{total_gpair_});
@@ -111,13 +111,13 @@ class TestPartitionBasedSplit : public ::testing::Test {
 };

 inline auto MakeCutsForTest(std::vector<float> values, std::vector<uint32_t> ptrs,
-                            std::vector<float> min_values, int32_t device) {
+                            std::vector<float> min_values, DeviceOrd device) {
   common::HistogramCuts cuts;
   cuts.cut_values_.HostVector() = values;
   cuts.cut_ptrs_.HostVector() = ptrs;
   cuts.min_vals_.HostVector() = min_values;
-  if (device >= 0) {
+  if (device.IsCUDA()) {
     cuts.cut_ptrs_.SetDevice(device);
     cuts.cut_values_.SetDevice(device);
     cuts.min_vals_.SetDevice(device);
@@ -136,7 +136,7 @@ class TestCategoricalSplitWithMissing : public testing::Test {
   TrainParam param_;
   void SetUp() override {
-    cuts_ = MakeCutsForTest({0.0, 1.0, 2.0, 3.0}, {0, 4}, {0.0}, -1);
+    cuts_ = MakeCutsForTest({0.0, 1.0, 2.0, 3.0}, {0, 4}, {0.0}, DeviceOrd::CPU());
     auto max_cat = *std::max_element(cuts_.cut_values_.HostVector().begin(),
                                      cuts_.cut_values_.HostVector().end());
     cuts_.SetCategorical(true, max_cat);
diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index 8b6c31fc0..e06d1b9a9 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -40,7 +40,7 @@ TEST(GpuHist, DeviceHistogram) {
   constexpr int kNNodes = 4;
   constexpr size_t kStopGrowing = kNNodes * kNBins * 2u;
   DeviceHistogramStorage<kStopGrowing> histogram;
-  histogram.Init(0, kNBins);
+  histogram.Init(FstCU(), kNBins);
   for (int i = 0; i < kNNodes; ++i) {
     histogram.AllocateHistograms({i});
   }
@@ -113,12 +113,12 @@ void TestBuildHist(bool use_shared_memory_histograms) {
     bst_float hess = dist(&gen);
     gp = GradientPair(grad, hess);
   }
-  gpair.SetDevice(0);
+  gpair.SetDevice(DeviceOrd::CUDA(0));

   thrust::host_vector<common::CompressedByteT> h_gidx_buffer (page->gidx_buffer.HostVector());
-  maker.row_partitioner = std::make_unique<RowPartitioner>(0, kNRows);
+  maker.row_partitioner = std::make_unique<RowPartitioner>(FstCU(), kNRows);

-  maker.hist.Init(0, page->Cuts().TotalBins());
+  maker.hist.Init(FstCU(), page->Cuts().TotalBins());
   maker.hist.AllocateHistograms({0});

   maker.gpair = gpair.DeviceSpan();
@@ -127,8 +127,8 @@ void TestBuildHist(bool use_shared_memory_histograms) {

   maker.InitFeatureGroupsOnce();

-  BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(0),
-                         maker.feature_groups->DeviceAccessor(0), gpair.DeviceSpan(),
+  BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(DeviceOrd::CUDA(0)),
+                         maker.feature_groups->DeviceAccessor(DeviceOrd::CUDA(0)), gpair.DeviceSpan(),
                          maker.row_partitioner->GetRows(0), maker.hist.GetNodeHistogram(0),
                          *maker.quantiser, !use_shared_memory_histograms);
@@ -215,7 +215,7 @@ void TestHistogramIndexImpl() {
   // histogram index
   const auto &maker = hist_maker.maker;
   auto grad = GenerateRandomGradients(kNRows);
-  grad.SetDevice(0);
+  grad.SetDevice(DeviceOrd::CUDA(0));
   maker->Reset(&grad, hist_maker_dmat.get(), kNCols);

   std::vector<common::CompressedByteT> h_gidx_buffer(maker->page->gidx_buffer.HostVector());
@@ -281,17 +281,17 @@ TEST(GpuHist, UniformSampling) {

   // Create an in-memory DMatrix.
   std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true));

-  linalg::Matrix<GradientPair> gpair({kRows}, Context{}.MakeCUDA().Ordinal());
+  linalg::Matrix<GradientPair> gpair({kRows}, Context{}.MakeCUDA().Device());
   gpair.Data()->Copy(GenerateRandomGradients(kRows));

   // Build a tree using the in-memory DMatrix.
   RegTree tree;
-  HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
+  HostDeviceVector<bst_float> preds(kRows, 0.0, DeviceOrd::CUDA(0));
   Context ctx(MakeCUDACtx(0));
   UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows);
   // Build another tree using sampling.
   RegTree tree_sampling;
-  HostDeviceVector<bst_float> preds_sampling(kRows, 0.0, 0);
+  HostDeviceVector<bst_float> preds_sampling(kRows, 0.0, DeviceOrd::CUDA(0));
   UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample, "uniform",
              kRows);
@@ -312,18 +312,18 @@ TEST(GpuHist, GradientBasedSampling) {

   // Create an in-memory DMatrix.
   std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrixWithRC(kRows, kCols, 0, true));

-  linalg::Matrix<GradientPair> gpair({kRows}, MakeCUDACtx(0).Ordinal());
+  linalg::Matrix<GradientPair> gpair({kRows}, MakeCUDACtx(0).Device());
   gpair.Data()->Copy(GenerateRandomGradients(kRows));

   // Build a tree using the in-memory DMatrix.
   RegTree tree;
-  HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
+  HostDeviceVector<bst_float> preds(kRows, 0.0, DeviceOrd::CUDA(0));
   Context ctx(MakeCUDACtx(0));
   UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows);
   // Build another tree using sampling.
   RegTree tree_sampling;
-  HostDeviceVector<bst_float> preds_sampling(kRows, 0.0, 0);
+  HostDeviceVector<bst_float> preds_sampling(kRows, 0.0, DeviceOrd::CUDA(0));
   UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree_sampling, &preds_sampling, kSubsample,
              "gradient_based", kRows);
@@ -350,16 +350,16 @@ TEST(GpuHist, ExternalMemory) {
   std::unique_ptr<DMatrix> dmat(CreateSparsePageDMatrix(kRows, kCols, 1, tmpdir.path + "/cache"));

   Context ctx(MakeCUDACtx(0));
-  linalg::Matrix<GradientPair> gpair({kRows}, ctx.Ordinal());
+  linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
   gpair.Data()->Copy(GenerateRandomGradients(kRows));

   // Build a tree using the in-memory DMatrix.
   RegTree tree;
-  HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
+  HostDeviceVector<bst_float> preds(kRows, 0.0, DeviceOrd::CUDA(0));
   UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, 1.0, "uniform", kRows);
   // Build another tree using multiple ELLPACK pages.
   RegTree tree_ext;
-  HostDeviceVector<bst_float> preds_ext(kRows, 0.0, 0);
+  HostDeviceVector<bst_float> preds_ext(kRows, 0.0, DeviceOrd::CUDA(0));
   UpdateTree(&ctx, &gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext, 1.0, "uniform", kRows);

   // Make sure the predictions are the same.
@@ -388,20 +388,20 @@ TEST(GpuHist, ExternalMemoryWithSampling) {
       CreateSparsePageDMatrix(kRows, kCols, kRows / kPageSize, tmpdir.path + "/cache"));

   Context ctx(MakeCUDACtx(0));
-  linalg::Matrix<GradientPair> gpair({kRows}, ctx.Ordinal());
+  linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
   gpair.Data()->Copy(GenerateRandomGradients(kRows));

   // Build a tree using the in-memory DMatrix.
   auto rng = common::GlobalRandom();

   RegTree tree;
-  HostDeviceVector<bst_float> preds(kRows, 0.0, 0);
+  HostDeviceVector<bst_float> preds(kRows, 0.0, DeviceOrd::CUDA(0));
   UpdateTree(&ctx, &gpair, dmat.get(), 0, &tree, &preds, kSubsample, kSamplingMethod, kRows);

   // Build another tree using multiple ELLPACK pages.
   common::GlobalRandom() = rng;
   RegTree tree_ext;
-  HostDeviceVector<bst_float> preds_ext(kRows, 0.0, 0);
+  HostDeviceVector<bst_float> preds_ext(kRows, 0.0, DeviceOrd::CUDA(0));
   UpdateTree(&ctx, &gpair, dmat_ext.get(), kPageSize, &tree_ext, &preds_ext, kSubsample,
              kSamplingMethod, kRows);
@@ -445,7 +445,7 @@ TEST(GpuHist, MaxDepth) {
 }

 namespace {
-RegTree GetUpdatedTree(Context const* ctx, DMatrix* dmat) {
+RegTree GetHistTree(Context const* ctx, DMatrix* dmat) {
   ObjInfo task{ObjInfo::kRegression};
   GPUHistMaker hist_maker{ctx, &task};
   hist_maker.Configure(Args{});
@@ -453,7 +453,7 @@ RegTree GetUpdatedTree(Context const* ctx, DMatrix* dmat) {
   TrainParam param;
   param.UpdateAllowUnknown(Args{});

-  linalg::Matrix<GradientPair> gpair({dmat->Info().num_row_}, ctx->Ordinal());
+  linalg::Matrix<GradientPair> gpair({dmat->Info().num_row_}, ctx->Device());
   gpair.Data()->Copy(GenerateRandomGradients(dmat->Info().num_row_));

   std::vector<HostDeviceVector<bst_node_t>> position(1);
@@ -463,7 +463,7 @@ RegTree GetUpdatedTree(Context const* ctx, DMatrix* dmat) {
   return tree;
 }

-void VerifyColumnSplit(bst_row_t rows, bst_feature_t cols, RegTree const& expected_tree) {
+void VerifyHistColumnSplit(bst_row_t rows, bst_feature_t cols, RegTree const& expected_tree) {
   Context ctx(MakeCUDACtx(GPUIDX));

   auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true);
@@ -471,7 +471,7 @@ void VerifyColumnSplit(bst_row_t rows, bst_feature_t cols, RegTree const& expect
   auto const rank = collective::GetRank();
   std::unique_ptr<DMatrix> sliced{Xy->SliceCol(world_size, rank)};

-  RegTree tree = GetUpdatedTree(&ctx, sliced.get());
+  RegTree tree = GetHistTree(&ctx, sliced.get());

   Json json{Object{}};
   tree.SaveModel(&json);
@@ -489,8 +489,58 @@ TEST_F(MGPUHistTest, GPUHistColumnSplit) {

   Context ctx(MakeCUDACtx(0));
   auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
-  RegTree expected_tree = GetUpdatedTree(&ctx, dmat.get());
+  RegTree expected_tree = GetHistTree(&ctx, dmat.get());

-  DoTest(VerifyColumnSplit, kRows, kCols, expected_tree);
+  DoTest(VerifyHistColumnSplit, kRows, kCols, expected_tree);
+}
+
+namespace {
+RegTree GetApproxTree(Context const* ctx, DMatrix* dmat) {
+  ObjInfo task{ObjInfo::kRegression};
+  GPUGlobalApproxMaker approx_maker{ctx, &task};
+  approx_maker.Configure(Args{});
+
+  TrainParam param;
+  param.UpdateAllowUnknown(Args{});
+
+  linalg::Matrix<GradientPair> gpair({dmat->Info().num_row_}, ctx->Device());
+  gpair.Data()->Copy(GenerateRandomGradients(dmat->Info().num_row_));
+
+  std::vector<HostDeviceVector<bst_node_t>> position(1);
+  RegTree tree;
+  approx_maker.Update(&param, &gpair, dmat, common::Span<HostDeviceVector<bst_node_t>>{position},
+                      {&tree});
+  return tree;
+}
+
+void VerifyApproxColumnSplit(bst_row_t rows, bst_feature_t cols, RegTree const& expected_tree) {
+  Context ctx(MakeCUDACtx(GPUIDX));
+
+  auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true);
+  auto const world_size = collective::GetWorldSize();
+  auto const rank = collective::GetRank();
+  std::unique_ptr<DMatrix> sliced{Xy->SliceCol(world_size, rank)};
+
+  RegTree tree = GetApproxTree(&ctx, sliced.get());
+
+  Json json{Object{}};
+  tree.SaveModel(&json);
+  Json expected_json{Object{}};
+  expected_tree.SaveModel(&expected_json);
+  ASSERT_EQ(json, expected_json);
+}
+}  // anonymous namespace
+
+class MGPUApproxTest : public BaseMGPUTest {};
+
+TEST_F(MGPUApproxTest, GPUApproxColumnSplit) {
+  auto constexpr kRows = 32;
+  auto constexpr kCols = 16;
+
+  Context ctx(MakeCUDACtx(0));
+  auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
+  RegTree expected_tree = GetApproxTree(&ctx, dmat.get());
+
+  DoTest(VerifyApproxColumnSplit, kRows, kCols, expected_tree);
+}
 }  // namespace xgboost::tree
diff --git a/tests/cpp/tree/test_histmaker.cc b/tests/cpp/tree/test_histmaker.cc
index e90120231..963660f59 100644
--- a/tests/cpp/tree/test_histmaker.cc
+++ b/tests/cpp/tree/test_histmaker.cc
@@ -28,7 +28,7 @@ TEST(GrowHistMaker, InteractionConstraint) {
   auto p_dmat = GenerateDMatrix(kRows, kCols);

   Context ctx;
-  linalg::Matrix<GradientPair> gpair({kRows}, ctx.Ordinal());
+  linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
   gpair.Data()->Copy(GenerateRandomGradients(kRows));

   ObjInfo task{ObjInfo::kRegression};
@@ -74,7 +74,7 @@ void VerifyColumnSplit(int32_t rows, bst_feature_t cols, bool categorical,
                        RegTree const& expected_tree) {
   Context ctx;
   auto p_dmat = GenerateDMatrix(rows, cols, categorical);
-  linalg::Matrix<GradientPair> gpair({rows}, ctx.Ordinal());
+  linalg::Matrix<GradientPair> gpair({rows}, ctx.Device());
   gpair.Data()->Copy(GenerateRandomGradients(rows));

@@ -107,7 +107,7 @@ void TestColumnSplit(bool categorical) {
   {
     Context ctx;
     auto p_dmat = GenerateDMatrix(kRows, kCols, categorical);
-    linalg::Matrix<GradientPair> gpair({kRows}, ctx.Ordinal());
+    linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
     gpair.Data()->Copy(GenerateRandomGradients(kRows));
     std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)};
     std::vector<HostDeviceVector<bst_node_t>> position(1);
diff --git a/tests/cpp/tree/test_multi_target_tree_model.cc b/tests/cpp/tree/test_multi_target_tree_model.cc
index af83ed7eb..550b8837c 100644
--- a/tests/cpp/tree/test_multi_target_tree_model.cc
+++ b/tests/cpp/tree/test_multi_target_tree_model.cc
@@ -12,9 +12,9 @@ TEST(MultiTargetTree, JsonIO) {
   bst_feature_t n_features{4};
   RegTree tree{n_targets, n_features};
   ASSERT_TRUE(tree.IsMultiTarget());
-  linalg::Vector<float> base_weight{{1.0f, 2.0f, 3.0f}, {3ul}, Context::kCpuId};
-  linalg::Vector<float> left_weight{{2.0f, 3.0f, 4.0f}, {3ul}, Context::kCpuId};
-  linalg::Vector<float> right_weight{{3.0f, 4.0f, 5.0f}, {3ul}, Context::kCpuId};
+  linalg::Vector<float> base_weight{{1.0f, 2.0f, 3.0f}, {3ul}, DeviceOrd::CPU()};
+  linalg::Vector<float> left_weight{{2.0f, 3.0f, 4.0f}, {3ul}, DeviceOrd::CPU()};
+  linalg::Vector<float> right_weight{{3.0f, 4.0f, 5.0f}, {3ul}, DeviceOrd::CPU()};
   tree.ExpandNode(RegTree::kRoot, /*split_idx=*/1, 0.5f, true, base_weight.HostView(),
                   left_weight.HostView(), right_weight.HostView());
   ASSERT_EQ(tree.NumNodes(), 3);
diff --git a/tests/cpp/tree/test_tree_stat.cc b/tests/cpp/tree/test_tree_stat.cc
index e37e5d7ca..7f3e3bc94 100644
--- a/tests/cpp/tree/test_tree_stat.cc
+++ b/tests/cpp/tree/test_tree_stat.cc
@@ -33,7 +33,7 @@ class UpdaterTreeStatTest : public ::testing::Test {
     ObjInfo task{ObjInfo::kRegression};
     param.Init(Args{});

-    Context ctx(updater == "grow_gpu_hist" ? MakeCUDACtx(0) : MakeCUDACtx(Context::kCpuId));
+    Context ctx(updater == "grow_gpu_hist" ? MakeCUDACtx(0) : MakeCUDACtx(DeviceOrd::CPUOrdinal()));
     auto up = std::unique_ptr<TreeUpdater>{TreeUpdater::Create(updater, &ctx, &task)};
     up->Configure(Args{});
     RegTree tree{1u, kCols};
@@ -78,7 +78,7 @@ class UpdaterEtaTest : public ::testing::Test {
   void RunTest(std::string updater) {
     ObjInfo task{ObjInfo::kClassification};

-    Context ctx(updater == "grow_gpu_hist" ? MakeCUDACtx(0) : MakeCUDACtx(Context::kCpuId));
+    Context ctx(updater == "grow_gpu_hist" ? MakeCUDACtx(0) : MakeCUDACtx(DeviceOrd::CPUOrdinal()));
     float eta = 0.4;
     auto up_0 = std::unique_ptr<TreeUpdater>{TreeUpdater::Create(updater, &ctx, &task)};
diff --git a/tests/python/test_callback.py b/tests/python/test_callback.py
index 56c9fdabd..262c09c99 100644
--- a/tests/python/test_callback.py
+++ b/tests/python/test_callback.py
@@ -443,7 +443,7 @@ class TestCallbacks:
         m = xgb.DMatrix(X, y)
         with tempfile.TemporaryDirectory() as tmpdir:
             check_point = xgb.callback.TrainingCheckPoint(
-                directory=tmpdir, iterations=1, name="model"
+                directory=tmpdir, interval=1, name="model"
             )
             xgb.train(
                 {"objective": "binary:logistic"},
@@ -456,7 +456,7 @@ class TestCallbacks:
                 assert os.path.exists(os.path.join(tmpdir, "model_" + str(i) + ".json"))

             check_point = xgb.callback.TrainingCheckPoint(
-                directory=tmpdir, iterations=1, as_pickle=True, name="model"
+                directory=tmpdir, interval=1, as_pickle=True, name="model"
             )
             xgb.train(
                 {"objective": "binary:logistic"},
diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py
index 73e2055b7..51bee5669 100644
--- a/tests/python/test_dmatrix.py
+++ b/tests/python/test_dmatrix.py
@@ -1,4 +1,5 @@
 import os
+import sys
 import tempfile

 import numpy as np
@@ -9,6 +10,7 @@ from scipy.sparse import csr_matrix, rand

 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.core import DataSplitMode
 from xgboost.testing.data import np_dtypes

 rng = np.random.RandomState(1)
@@ -467,3 +469,97 @@ class TestDMatrix:
         m0 = xgb.DMatrix(orig)
         m1 = xgb.DMatrix(x)
         assert tm.predictor_equal(m0, m1)
+
+
+class TestDMatrixColumnSplit:
+    def test_numpy(self):
+        def verify_numpy():
+            data = np.random.randn(5, 5)
+            dm = xgb.DMatrix(data, data_split_mode=DataSplitMode.COL)
+            assert dm.num_row() == 5
+            assert dm.num_col() == 5 * xgb.collective.get_world_size()
+            assert dm.feature_names is None
+            assert dm.feature_types is None
+
+        tm.run_with_rabit(world_size=3, test_fn=verify_numpy)
+
+    def test_numpy_feature_names(self):
+        def verify_numpy_feature_names():
+            world_size = xgb.collective.get_world_size()
+            data = np.random.randn(5, 5)
+            feature_names = [f'feature{x}' for x in range(5)]
+            feature_types = ['float'] * 5
+            dm = xgb.DMatrix(data, feature_names=feature_names, feature_types=feature_types,
+                             data_split_mode=DataSplitMode.COL)
+            assert dm.num_row() == 5
+            assert dm.num_col() == 5 * world_size
+            assert len(dm.feature_names) == 5 * world_size
+            assert len(dm.feature_types) == 5 * world_size
+
+        tm.run_with_rabit(world_size=3, test_fn=verify_numpy_feature_names)
+
+    def test_csr(self):
+        def verify_csr():
+            indptr = np.array([0, 2, 3, 6])
+            indices = np.array([0, 2, 2, 0, 1, 2])
+            data = np.array([1, 2, 3, 4, 5, 6])
+            X = scipy.sparse.csr_matrix((data, indices, indptr), shape=(3, 3))
+            dtrain = xgb.DMatrix(X, data_split_mode=DataSplitMode.COL)
+            assert dtrain.num_row() == 3
+            assert dtrain.num_col() == 3 * xgb.collective.get_world_size()
+
+        tm.run_with_rabit(world_size=3, test_fn=verify_csr)
+
+    def test_csc(self):
+        def verify_csc():
+            row = np.array([0, 2, 2, 0, 1, 2])
+            col = np.array([0, 0, 1, 2, 2, 2])
+            data = np.array([1, 2, 3, 4, 5, 6])
+            X = scipy.sparse.csc_matrix((data, (row, col)), shape=(3, 3))
+            dtrain = xgb.DMatrix(X, data_split_mode=DataSplitMode.COL)
+            assert dtrain.num_row() == 3
+            assert dtrain.num_col() == 3 * xgb.collective.get_world_size()
+
+        tm.run_with_rabit(world_size=3, test_fn=verify_csc)
+
+    def test_coo(self):
+        def verify_coo():
+            row = np.array([0, 2, 2, 0, 1, 2])
+            col = np.array([0, 0, 1, 2, 2, 2])
+            data = np.array([1, 2, 3, 4, 5, 6])
+            X = scipy.sparse.coo_matrix((data, (row, col)), shape=(3, 3))
+            dtrain = xgb.DMatrix(X, data_split_mode=DataSplitMode.COL)
+            assert dtrain.num_row() == 3
+            assert dtrain.num_col() == 3 * xgb.collective.get_world_size()
+
+        tm.run_with_rabit(world_size=3, test_fn=verify_coo)
+
+    def test_list(self):
+        def verify_list():
+            data = [
+                [1, 2, 3, 4, 5],
+                [6, 7, 8, 9, 10],
+                [11, 12, 13, 14, 15],
+                [16, 17, 18, 19, 20],
+                [21, 22, 23, 24, 25]
+            ]
+            dm = xgb.DMatrix(data, data_split_mode=DataSplitMode.COL)
+            assert dm.num_row() == 5
+            assert dm.num_col() == 5 * xgb.collective.get_world_size()
+
+        tm.run_with_rabit(world_size=3, test_fn=verify_list)
+
+    def test_tuple(self):
+        def verify_tuple():
+            data = (
+                (1, 2, 3, 4, 5),
+                (6, 7, 8, 9, 10),
+                (11, 12, 13, 14, 15),
+                (16, 17, 18, 19, 20),
+                (21, 22, 23, 24, 25)
+            )
+            dm = xgb.DMatrix(data, data_split_mode=DataSplitMode.COL)
+            assert dm.num_row() == 5
+            assert dm.num_col() == 5 * xgb.collective.get_world_size()
+
+        tm.run_with_rabit(world_size=3, test_fn=verify_tuple)
diff --git a/tests/python/test_with_arrow.py b/tests/python/test_with_arrow.py
index 8b7bce9eb..fdc4c7dbe 100644
--- a/tests/python/test_with_arrow.py
+++ b/tests/python/test_with_arrow.py
@@ -1,4 +1,5 @@
 import os
+import sys
 import unittest

 import numpy as np
@@ -6,6 +7,7 @@
 import pytest

 import xgboost as xgb
 from xgboost import testing as tm
+from xgboost.core import DataSplitMode

 try:
     import pandas as pd
@@ -22,7 +24,7 @@ pytestmark = pytest.mark.skipif(
 dpath = "demo/data/"


-class TestArrowTable(unittest.TestCase):
+class TestArrowTable:
     def test_arrow_table(self):
         df = pd.DataFrame(
             [[0, 1, 2.0, 3.0], [1, 2, 3.0, 4.0]], columns=["a", "b", "c", "d"]
         )
@@ -52,7 +54,8 @@ class TestArrowTable(unittest.TestCase):
         assert dm.num_row() == 4
         assert dm.num_col() == 3

-    def test_arrow_train(self):
+    @pytest.mark.parametrize("DMatrixT", [xgb.DMatrix, xgb.QuantileDMatrix])
+    def test_arrow_train(self, DMatrixT):
         import pandas as pd

         rows = 100
@@ -64,16 +67,24 @@ class TestArrowTable(unittest.TestCase):
             }
         )
         y = pd.Series(np.random.randn(rows))
+        table = pa.Table.from_pandas(X)

-        dtrain1 = xgb.DMatrix(table)
-        dtrain1.set_label(y)
+        dtrain1 = DMatrixT(table)
+        dtrain1.set_label(pa.Table.from_pandas(pd.DataFrame(y)))
         bst1 = xgb.train({}, dtrain1, num_boost_round=10)
-        preds1 = bst1.predict(xgb.DMatrix(X))
-        dtrain2 = xgb.DMatrix(X, y)
+        preds1 = bst1.predict(DMatrixT(X))
+
+        dtrain2 = DMatrixT(X, y)
         bst2 = xgb.train({}, dtrain2, num_boost_round=10)
-        preds2 = bst2.predict(xgb.DMatrix(X))
+        preds2 = bst2.predict(DMatrixT(X))
+
         np.testing.assert_allclose(preds1, preds2)

+        preds3 = bst2.inplace_predict(table)
+        np.testing.assert_allclose(preds1, preds3)
+
+        assert bst2.feature_names == ["A", "B", "C"]
+        assert bst2.feature_types == ["int", "float", "int"]
+
     def test_arrow_survival(self):
         data = os.path.join(tm.data_dir(__file__), "veterans_lung_cancer.csv")
         table = pc.read_csv(data)
@@ -88,3 +99,17 @@ y_np_up = dtrain.get_float_info("label_upper_bound")
         y_np_low = dtrain.get_float_info("label_lower_bound")
         np.testing.assert_equal(y_np_up, y_upper_bound.to_pandas().values)
         np.testing.assert_equal(y_np_low, y_lower_bound.to_pandas().values)
+
+
+class TestArrowTableColumnSplit:
+    def test_arrow_table(self):
+        def verify_arrow_table():
+            df = pd.DataFrame(
+                [[0, 1, 2.0, 3.0], [1, 2, 3.0, 4.0]], columns=["a", "b", "c", "d"]
+            )
+            table = pa.Table.from_pandas(df)
+            dm = xgb.DMatrix(table, data_split_mode=DataSplitMode.COL)
+            assert dm.num_row() == 2
+            assert dm.num_col() == 4 * xgb.collective.get_world_size()
+
+        tm.run_with_rabit(world_size=3, test_fn=verify_arrow_table)
diff --git a/tests/python/test_with_pandas.py b/tests/python/test_with_pandas.py
index f8a21b6ab..a23a66b63 100644
--- a/tests/python/test_with_pandas.py
+++ b/tests/python/test_with_pandas.py
@@ -211,7 +211,7 @@ class TestPandas:
         y = np.random.randn(kRows)
         w = np.random.uniform(size=kRows).astype(np.float32)
         w_pd = pd.DataFrame(w)
-        data = xgb.DMatrix(X, y, w_pd)
+        data = xgb.DMatrix(X, y, weight=w_pd)

         assert data.num_row() == kRows
         assert data.num_col() == kCols
@@ -301,14 +301,14 @@ class TestPandas:

     @pytest.mark.parametrize("DMatrixT", [xgb.DMatrix, xgb.QuantileDMatrix])
     def test_nullable_type(self, DMatrixT) -> None:
-        from pandas.api.types import is_categorical_dtype
+        from xgboost.data import is_pd_cat_dtype

         for orig, df in pd_dtypes():
             if hasattr(df.dtypes, "__iter__"):
-                enable_categorical = any(is_categorical_dtype for dtype in df.dtypes)
+                enable_categorical = any(is_pd_cat_dtype(dtype) for dtype in df.dtypes)
             else:
                 # series
-                enable_categorical = is_categorical_dtype(df.dtype)
+                enable_categorical = is_pd_cat_dtype(df.dtype)

             f0_orig = orig[orig.columns[0]] if isinstance(orig, pd.DataFrame) else orig
             f0 = df[df.columns[0]] if isinstance(df, pd.DataFrame) else df
diff --git a/tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py b/tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py
index a954d9d6c..513554e43 100644
--- a/tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py
+++ b/tests/test_distributed/test_gpu_with_spark/test_gpu_spark.py
@@ -2,6 +2,7 @@ import json
 import logging
 import subprocess

+import numpy as np
 import pytest
 import sklearn
@@ -13,7 +14,7 @@ from pyspark.ml.linalg import Vectors
 from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
 from pyspark.sql import SparkSession

-from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor
+from xgboost.spark import SparkXGBClassifier, SparkXGBRegressor, SparkXGBRegressorModel

 gpu_discovery_script_path = "tests/test_distributed/test_gpu_with_spark/discover_gpu.sh"
@@ -242,3 +243,33 @@ def test_sparkxgb_regressor_feature_cols_with_gpu(spark_diabetes_dataset_feature
     evaluator = RegressionEvaluator(metricName="rmse")
     rmse = evaluator.evaluate(pred_result_df)
     assert rmse <= 65.0
+
+
+def test_gpu_transform(spark_diabetes_dataset) -> None:
+    regressor = SparkXGBRegressor(device="cuda", num_workers=num_workers)
+    train_df, test_df = spark_diabetes_dataset
+    model: SparkXGBRegressorModel = regressor.fit(train_df)
+
+    # The model was trained with GPUs, so transform runs with GPU configuration.
+    assert model._gpu_transform()
+
+    model.set_device("cpu")
+    assert not model._gpu_transform()
+    # without error
+    cpu_rows = model.transform(test_df).select("prediction").collect()
+
+    regressor = SparkXGBRegressor(device="cpu", num_workers=num_workers)
+    model = regressor.fit(train_df)
+
+    # The model was trained with CPUs; even with GPU configuration,
+    # transforming still prefers CPUs.
+    assert not model._gpu_transform()
+
+    # Set GPU transform explicitly.
+    model.set_device("cuda")
+    assert model._gpu_transform()
+    # without error
+    gpu_rows = model.transform(test_df).select("prediction").collect()
+
+    for cpu, gpu in zip(cpu_rows, gpu_rows):
+        np.testing.assert_allclose(cpu.prediction, gpu.prediction, atol=1e-3)
diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py
index ae8d24139..3510dff7b 100644
--- a/tests/test_distributed/test_with_dask/test_with_dask.py
+++ b/tests/test_distributed/test_with_dask/test_with_dask.py
@@ -2238,7 +2238,7 @@ class TestDaskCallbacks:
             y,
             callbacks=[
                 xgb.callback.TrainingCheckPoint(
-                    directory=Path(tmpdir), iterations=1, name="model"
+                    directory=Path(tmpdir), interval=1, name="model"
                 )
             ],
         )
diff --git a/tests/test_distributed/test_with_spark/test_spark_local.py b/tests/test_distributed/test_with_spark/test_spark_local.py
index e323a3606..861e67a75 100644
--- a/tests/test_distributed/test_with_spark/test_spark_local.py
+++ b/tests/test_distributed/test_with_spark/test_spark_local.py
@@ -888,6 +888,34 @@ class TestPySparkLocal:
         clf = SparkXGBClassifier(device="cuda")
         clf._validate_params()

+    def test_gpu_transform(self, clf_data: ClfData) -> None:
+        """local mode"""
+        classifier = SparkXGBClassifier(device="cpu")
+        model: SparkXGBClassifierModel = classifier.fit(clf_data.cls_df_train)
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            path = "file:" + tmpdir
+            model.write().overwrite().save(path)
+
+            # The model was trained with CPU; transform defaults to CPU.
+            assert not model._gpu_transform()
+
+            # without error
+            model.transform(clf_data.cls_df_test).collect()
+
+            model.set_device("cuda")
+            assert model._gpu_transform()
+
+            model_loaded = SparkXGBClassifierModel.load(path)
+
+            # The loaded model was trained with CPU; transform defaults to CPU.
+            assert not model_loaded._gpu_transform()
+            # without error
+            model_loaded.transform(clf_data.cls_df_test).collect()
+
+            model_loaded.set_device("cuda")
+            assert model_loaded._gpu_transform()
+
 class XgboostLocalTest(SparkTestCase):

     def setUp(self):