diff --git a/.clang-format b/.clang-format index 0984d5a7b..737cf9006 100644 --- a/.clang-format +++ b/.clang-format @@ -17,7 +17,7 @@ AllowShortEnumsOnASingleLine: true AllowShortBlocksOnASingleLine: Never AllowShortCaseLabelsOnASingleLine: false AllowShortFunctionsOnASingleLine: All -AllowShortLambdasOnASingleLine: All +AllowShortLambdasOnASingleLine: Inline AllowShortIfStatementsOnASingleLine: WithoutElse AllowShortLoopsOnASingleLine: true AlwaysBreakAfterDefinitionReturnType: None diff --git a/.github/dependabot.yml b/.github/dependabot.yml index c03a52c60..0cc0c16fd 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -8,7 +8,7 @@ updates: - package-ecosystem: "maven" directory: "/jvm-packages" schedule: - interval: "daily" + interval: "monthly" - package-ecosystem: "maven" directory: "/jvm-packages/xgboost4j" schedule: @@ -16,11 +16,11 @@ updates: - package-ecosystem: "maven" directory: "/jvm-packages/xgboost4j-gpu" schedule: - interval: "daily" + interval: "monthly" - package-ecosystem: "maven" directory: "/jvm-packages/xgboost4j-example" schedule: - interval: "daily" + interval: "monthly" - package-ecosystem: "maven" directory: "/jvm-packages/xgboost4j-spark" schedule: @@ -28,4 +28,4 @@ updates: - package-ecosystem: "maven" directory: "/jvm-packages/xgboost4j-spark-gpu" schedule: - interval: "daily" + interval: "monthly" diff --git a/.github/workflows/i386.yml b/.github/workflows/i386.yml index 4a4d65b25..ca5baf412 100644 --- a/.github/workflows/i386.yml +++ b/.github/workflows/i386.yml @@ -5,6 +5,10 @@ on: [push, pull_request] permissions: contents: read # to fetch code (actions/checkout) +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: build-32bit: name: Build 32-bit diff --git a/.github/workflows/jvm_tests.yml b/.github/workflows/jvm_tests.yml index 330c037d7..9ef314ca5 100644 --- a/.github/workflows/jvm_tests.yml +++ b/.github/workflows/jvm_tests.yml @@ -5,6 +5,10 @@ on: [push, pull_request] permissions: contents: read # to fetch code (actions/checkout) +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: test-with-jvm: name: Test JVM on OS ${{ matrix.os }} @@ -15,31 +19,36 @@ jobs: os: [windows-latest, ubuntu-latest, macos-11] steps: - - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0 + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: submodules: 'true' - - uses: actions/setup-python@7f80679172b057fc5e90d70d197929d454754a5a # v4.3.0 + - uses: mamba-org/setup-micromamba@422500192359a097648154e8db4e39bdb6c6eed7 # v1.8.1 with: - python-version: '3.8' - architecture: 'x64' - - - uses: actions/setup-java@d202f5dbf7256730fb690ec59f6381650114feb2 # v3.6.0 - with: - java-version: 1.8 - - - name: Install Python packages - run: | - python -m pip install wheel setuptools - python -m pip install awscli + micromamba-version: '1.5.6-0' + environment-name: jvm_tests + create-args: >- + python=3.10 + awscli + cache-downloads: true + cache-environment: true + init-shell: bash powershell - name: Cache Maven packages - uses: actions/cache@6998d139ddd3e68c71e9e398d8e40b71a2f39812 # v3.2.5 + uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0 with: path: ~/.m2 key: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} restore-keys: ${{ runner.os }}-m2-${{ hashFiles('./jvm-packages/pom.xml') }} + - name: Build xgboost4j.dll + run: | + 
mkdir build + cd build + cmake .. -G"Visual Studio 17 2022" -A x64 -DJVM_BINDINGS=ON + cmake --build . --config Release + if: matrix.os == 'windows-latest' + - name: Test XGBoost4J (Core) run: | cd jvm-packages @@ -47,7 +56,8 @@ jobs: - name: Extract branch name shell: bash - run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})" + run: | + echo "branch=${GITHUB_REF#refs/heads/}" >> "$GITHUB_OUTPUT" id: extract_branch if: | (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && @@ -58,7 +68,7 @@ jobs: cd lib/ Rename-Item -Path xgboost4j.dll -NewName xgboost4j_${{ github.sha }}.dll dir - python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read + python -m awscli s3 cp xgboost4j_${{ github.sha }}.dll s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read --region us-west-2 if: | (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && matrix.os == 'windows-latest' @@ -67,11 +77,12 @@ jobs: AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} - name: Publish artifact libxgboost4j.dylib to S3 + shell: bash -l {0} run: | cd lib/ mv -v libxgboost4j.dylib libxgboost4j_${{ github.sha }}.dylib ls - python -m awscli s3 cp libxgboost4j_${{ github.sha }}.dylib s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read + python -m awscli s3 cp libxgboost4j_${{ github.sha }}.dylib s3://xgboost-nightly-builds/${{ steps.extract_branch.outputs.branch }}/libxgboost4j/ --acl public-read --region us-west-2 if: | (github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')) && matrix.os == 'macos-11' diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 20e91a5d9..b064b4843 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -9,6 +9,10 @@ on: [push, pull_request] permissions: contents: read # to fetch code (actions/checkout) +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + # A workflow run is made up of one or more jobs that can run sequentially or in parallel jobs: gtest-cpu: @@ -174,7 +178,7 @@ jobs: - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0 with: submodules: 'true' - - uses: actions/setup-python@7f80679172b057fc5e90d70d197929d454754a5a # v4.3.0 + - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: "3.8" architecture: 'x64' diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index 0fca76673..0a182677f 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -9,6 +9,10 @@ defaults: run: shell: bash -l {0} +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: python-mypy-lint: runs-on: ubuntu-latest @@ -310,7 +314,7 @@ jobs: submodules: 'true' - name: Set up Python 3.8 - uses: actions/setup-python@v4 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: 3.8 diff --git a/.github/workflows/python_wheels.yml b/.github/workflows/python_wheels.yml index f46b77295..cb56e1214 100644 --- a/.github/workflows/python_wheels.yml +++ b/.github/workflows/python_wheels.yml @@ -5,6 +5,10 @@ on: [push, pull_request] 
permissions: contents: read # to fetch code (actions/checkout) +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: python-wheels: name: Build wheel for ${{ matrix.platform_id }} @@ -17,11 +21,11 @@ jobs: - os: macos-latest platform_id: macosx_arm64 steps: - - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0 + - uses: actions/checkout@a12a3943b4bdde767164f792f33f40b04645d846 # v3.0.0 with: submodules: 'true' - name: Setup Python - uses: actions/setup-python@7f80679172b057fc5e90d70d197929d454754a5a # v4.3.0 + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: "3.8" - name: Build wheels diff --git a/.github/workflows/r_nold.yml b/.github/workflows/r_nold.yml index a014c9138..eb7179e81 100644 --- a/.github/workflows/r_nold.yml +++ b/.github/workflows/r_nold.yml @@ -10,6 +10,10 @@ on: permissions: contents: read # to fetch code (actions/checkout) +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: test-R-noLD: if: github.event.comment.body == '/gha run r-nold-test' && contains('OWNER,MEMBER,COLLABORATOR', github.event.comment.author_association) diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml index d004ab15c..7dbdf3a84 100644 --- a/.github/workflows/r_tests.yml +++ b/.github/workflows/r_tests.yml @@ -8,6 +8,10 @@ env: permissions: contents: read # to fetch code (actions/checkout) +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + jobs: lintr: runs-on: ${{ matrix.config.os }} @@ -46,7 +50,7 @@ jobs: MAKEFLAGS="-j$(nproc)" R CMD INSTALL R-package/ Rscript tests/ci_build/lint_r.R $(pwd) - test-R-on-Windows: + test-Rpkg: runs-on: ${{ matrix.config.os }} name: Test R on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }} strategy: @@ -54,11 +58,17 @@ jobs: matrix: config: - {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'} + - {os: ubuntu-latest, r: 'release', compiler: 'none', build: 'cmake'} env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true RSPM: ${{ matrix.config.rspm }} steps: + - name: Install system dependencies + run: | + sudo apt update + sudo apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev + if: matrix.config.os == 'ubuntu-latest' - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0 with: submodules: 'true' @@ -74,7 +84,7 @@ jobs: key: ${{ runner.os }}-r-${{ matrix.config.r }}-6-${{ hashFiles('R-package/DESCRIPTION') }} restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-6-${{ hashFiles('R-package/DESCRIPTION') }} - - uses: actions/setup-python@7f80679172b057fc5e90d70d197929d454754a5a # v4.3.0 + - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 with: python-version: "3.8" architecture: 'x64' @@ -89,12 +99,18 @@ jobs: - name: Test R run: | python tests/ci_build/test_r_package.py --compiler='${{ matrix.config.compiler }}' --build-tool="${{ matrix.config.build }}" --task=check + if: matrix.config.compiler != 'none' + + - name: Test R + run: | + python tests/ci_build/test_r_package.py --build-tool="${{ matrix.config.build }}" --task=check + if: matrix.config.compiler == 'none' test-R-on-Debian: name: Test R package on Debian runs-on: 
ubuntu-latest container: - image: rhub/debian-gcc-devel + image: rhub/debian-gcc-release steps: - name: Install system dependencies @@ -114,12 +130,12 @@ jobs: - name: Install dependencies shell: bash -l {0} run: | - /tmp/R-devel/bin/Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" + Rscript -e "source('./R-package/tests/helper_scripts/install_deps.R')" - name: Test R shell: bash -l {0} run: | - python3 tests/ci_build/test_r_package.py --r=/tmp/R-devel/bin/R --build-tool=autotools --task=check + python3 tests/ci_build/test_r_package.py --r=/usr/bin/R --build-tool=autotools --task=check - uses: dorny/paths-filter@v2 id: changes @@ -131,4 +147,4 @@ jobs: - name: Run document check if: steps.changes.outputs.r_package == 'true' run: | - python3 tests/ci_build/test_r_package.py --r=/tmp/R-devel/bin/R --task=doc + python3 tests/ci_build/test_r_package.py --r=/usr/bin/R --task=doc diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 78cde0a43..24cf0cf35 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -22,12 +22,12 @@ jobs: steps: - name: "Checkout code" - uses: actions/checkout@a12a3943b4bdde767164f792f33f40b04645d846 # tag=v3.0.0 + uses: actions/checkout@a12a3943b4bdde767164f792f33f40b04645d846 # v3.0.0 with: persist-credentials: false - name: "Run analysis" - uses: ossf/scorecard-action@08b4669551908b1024bb425080c797723083c031 # tag=v2.2.0 + uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # v2.3.1 with: results_file: results.sarif results_format: sarif @@ -41,7 +41,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # tag=v3.1.2 + uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 with: name: SARIF file path: results.sarif @@ -49,6 +49,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@7b6664fa89524ee6e3c3e9749402d5afd69b3cd8 # tag=v2.14.1 + uses: github/codeql-action/upload-sarif@83a02f7883b12e0e4e1a146174f5e2292a01e601 # v2.16.4 with: sarif_file: results.sarif diff --git a/.github/workflows/update_rapids.yml b/.github/workflows/update_rapids.yml index 395a42148..22a395799 100644 --- a/.github/workflows/update_rapids.yml +++ b/.github/workflows/update_rapids.yml @@ -3,7 +3,7 @@ name: update-rapids on: workflow_dispatch: schedule: - - cron: "0 20 * * *" # Run once daily + - cron: "0 20 * * 1" # Run once weekly permissions: pull-requests: write @@ -32,7 +32,7 @@ jobs: run: | bash tests/buildkite/update-rapids.sh - name: Create Pull Request - uses: peter-evans/create-pull-request@v5 + uses: peter-evans/create-pull-request@v6 if: github.ref == 'refs/heads/master' with: add-paths: | diff --git a/NEWS.md b/NEWS.md index 43019d877..b067c8e3c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2101,7 +2101,7 @@ This release marks a major milestone for the XGBoost project. ## v0.90 (2019.05.18) ### XGBoost Python package drops Python 2.x (#4379, #4381) -Python 2.x is reaching its end-of-life at the end of this year. [Many scientific Python packages are now moving to drop Python 2.x](https://python3statement.org/). +Python 2.x is reaching its end-of-life at the end of this year. 
[Many scientific Python packages are now moving to drop Python 2.x](https://python3statement.github.io/). ### XGBoost4J-Spark now requires Spark 2.4.x (#4377) * Spark 2.3 is reaching its end-of-life soon. See discussion at #4389. diff --git a/R-package/CMakeLists.txt b/R-package/CMakeLists.txt index d3a69abc2..37c5dbf4c 100644 --- a/R-package/CMakeLists.txt +++ b/R-package/CMakeLists.txt @@ -26,7 +26,6 @@ endif() target_compile_definitions( xgboost-r PUBLIC -DXGBOOST_STRICT_R_MODE=1 - -DXGBOOST_CUSTOMIZE_GLOBAL_PRNG=1 -DDMLC_LOG_BEFORE_THROW=0 -DDMLC_DISABLE_STDIN=1 -DDMLC_LOG_CUSTOMIZE=1 diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 66e2b5692..b4072aff0 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -56,7 +56,8 @@ Suggests: testthat, igraph (>= 1.0.1), float, - titanic + titanic, + RhpcBLASctl Depends: R (>= 4.3.0) Imports: diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index 580d1f873..c9e085e77 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -20,15 +20,9 @@ export("xgb.attr<-") export("xgb.attributes<-") export("xgb.config<-") export("xgb.parameters<-") -export(cb.cv.predict) -export(cb.early.stop) -export(cb.evaluation.log) -export(cb.gblinear.history) -export(cb.print.evaluation) -export(cb.reset.parameters) -export(cb.save.model) export(getinfo) export(setinfo) +export(xgb.Callback) export(xgb.DMatrix) export(xgb.DMatrix.hasinfo) export(xgb.DMatrix.save) @@ -39,6 +33,13 @@ export(xgb.QuantileDMatrix) export(xgb.QuantileDMatrix.from_iterator) export(xgb.attr) export(xgb.attributes) +export(xgb.cb.cv.predict) +export(xgb.cb.early.stop) +export(xgb.cb.evaluation.log) +export(xgb.cb.gblinear.history) +export(xgb.cb.print.evaluation) +export(xgb.cb.reset.parameters) +export(xgb.cb.save.model) export(xgb.config) export(xgb.copy.Booster) export(xgb.create.features) @@ -72,14 +73,10 @@ export(xgb.slice.DMatrix) export(xgb.train) export(xgboost) import(methods) +importClassesFrom(Matrix,CsparseMatrix) importClassesFrom(Matrix,dgCMatrix) importClassesFrom(Matrix,dgRMatrix) -importClassesFrom(Matrix,dgeMatrix) -importFrom(Matrix,colSums) importFrom(Matrix,sparse.model.matrix) -importFrom(Matrix,sparseMatrix) -importFrom(Matrix,sparseVector) -importFrom(Matrix,t) importFrom(data.table,":=") importFrom(data.table,as.data.table) importFrom(data.table,data.table) @@ -101,6 +98,7 @@ importFrom(methods,new) importFrom(stats,coef) importFrom(stats,median) importFrom(stats,predict) +importFrom(stats,sd) importFrom(stats,variable.names) importFrom(utils,head) importFrom(utils,object.size) diff --git a/R-package/R/callbacks.R b/R-package/R/callbacks.R index 02e0a7cd4..39734ab09 100644 --- a/R-package/R/callbacks.R +++ b/R-package/R/callbacks.R @@ -1,769 +1,392 @@ -#' Callback closures for booster training. -#' -#' These are used to perform various service tasks either during boosting iterations or at the end. -#' This approach helps to modularize many of such tasks without bloating the main training methods, -#' and it offers . -#' -#' @details -#' By default, a callback function is run after each boosting iteration. -#' An R-attribute \code{is_pre_iteration} could be set for a callback to define a pre-iteration function. -#' -#' When a callback function has \code{finalize} parameter, its finalizer part will also be run after -#' the boosting is completed. -#' -#' WARNING: side-effects!!! 
Be aware that these callback functions access and modify things in -#' the environment from which they are called from, which is a fairly uncommon thing to do in R. -#' -#' To write a custom callback closure, make sure you first understand the main concepts about R environments. -#' Check either R documentation on \code{\link[base]{environment}} or the -#' \href{http://adv-r.had.co.nz/Environments.html}{Environments chapter} from the "Advanced R" -#' book by Hadley Wickham. Further, the best option is to read the code of some of the existing callbacks - -#' choose ones that do something similar to what you want to achieve. Also, you would need to get familiar -#' with the objects available inside of the \code{xgb.train} and \code{xgb.cv} internal environments. -#' -#' @seealso -#' \code{\link{cb.print.evaluation}}, -#' \code{\link{cb.evaluation.log}}, -#' \code{\link{cb.reset.parameters}}, -#' \code{\link{cb.early.stop}}, -#' \code{\link{cb.save.model}}, -#' \code{\link{cb.cv.predict}}, -#' \code{\link{xgb.train}}, -#' \code{\link{xgb.cv}} -#' -#' @name callbacks -NULL +.reserved_cb_names <- c("names", "class", "call", "params", "niter", "nfeatures", "folds") -# -# Callbacks ------------------------------------------------------------------- -# - -#' Callback closure for printing the result of evaluation +#' @title XGBoost Callback Constructor +#' @description Constructor for defining the structure of callback functions that can be executed +#' at different stages of model training (before / after training, before / after each boosting +#' iteration). +#' @param cb_name Name for the callback. #' -#' @param period results would be printed every number of periods -#' @param showsd whether standard deviations should be printed (when available) +#' If the callback produces some non-NULL result (from executing the function passed under +#' `f_after_training`), that result will be added as an R attribute to the resulting booster +#' (or as a named element in the result of CV), with the attribute name specified here. #' -#' @details -#' The callback function prints the result of evaluation at every \code{period} iterations. -#' The initial and the last iteration's evaluations are always printed. +#' Names of callbacks must be unique - i.e. there cannot be two callbacks with the same name. +#' @param env An environment object that will be passed to the different functions in the callback. +#' Note that this environment will not be shared with other callbacks. +#' @param f_before_training A function that will be executed before the training has started. #' -#' Callback function expects the following values to be set in its calling frame: -#' \code{bst_evaluation} (also \code{bst_evaluation_err} when available), -#' \code{iteration}, -#' \code{begin_iteration}, -#' \code{end_iteration}. +#' If passing `NULL` for this or for the other function inputs, then no function will be executed. #' -#' @seealso -#' \code{\link{callbacks}} +#' If passing a function, it will be called with parameters supplied as non-named arguments +#' matching the function signatures that are shown in the default value for each function argument. +#' @param f_before_iter A function that will be executed before each boosting round. 
 #'
-#' @export
-cb.print.evaluation <- function(period = 1, showsd = TRUE) {
-
-  callback <- function(env = parent.frame()) {
-    if (length(env$bst_evaluation) == 0 ||
-        period == 0 ||
-        NVL(env$rank, 0) != 0)
-      return()
-
-    i <- env$iteration
-    if ((i - 1) %% period == 0 ||
-        i == env$begin_iteration ||
-        i == env$end_iteration) {
-      stdev <- if (showsd) env$bst_evaluation_err else NULL
-      msg <- .format_eval_string(i, env$bst_evaluation, stdev)
-      cat(msg, '\n')
-    }
-  }
-  attr(callback, 'call') <- match.call()
-  attr(callback, 'name') <- 'cb.print.evaluation'
-  callback
-}
-
-
-#' Callback closure for logging the evaluation history
+#' This function can signal whether the training should be finalized or not, by outputting
+#' a value that evaluates to `TRUE` - i.e. if the output from the function provided here at
+#' a given round is `TRUE`, then training will be stopped before the current iteration happens.
 #'
-#' @details
-#' This callback function appends the current iteration evaluation results \code{bst_evaluation}
-#' available in the calling parent frame to the \code{evaluation_log} list in a calling frame.
+#' Return values of `NULL` will be interpreted as `FALSE`.
+#' @param f_after_iter A function that will be executed after each boosting round.
 #'
-#' The finalizer callback (called with \code{finalize = TURE} in the end) converts
-#' the \code{evaluation_log} list into a final data.table.
+#' This function can signal whether the training should be finalized or not, by outputting
+#' a value that evaluates to `TRUE` - i.e. if the output from the function provided here at
+#' a given round is `TRUE`, then training will be stopped at that round.
 #'
-#' The iteration evaluation result \code{bst_evaluation} must be a named numeric vector.
+#' Return values of `NULL` will be interpreted as `FALSE`.
+#' @param f_after_training A function that will be executed after training is finished.
 #'
-#' Note: in the column names of the final data.table, the dash '-' character is replaced with
-#' the underscore '_' in order to make the column names more like regular R identifiers.
+#' This function can optionally output something non-NULL, which will become part of the R
+#' attributes of the booster (assuming one passes `keep_extra_attributes=TRUE` to \link{xgb.train})
+#' under the name supplied for parameter `cb_name` in the case of \link{xgb.train}; or a part
+#' of the named elements in the result of \link{xgb.cv}.
+#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}.
+#' @details Arguments that will be passed to the supplied functions are as follows:\itemize{
 #'
-#' Callback function expects the following values to be set in its calling frame:
-#' \code{evaluation_log},
-#' \code{bst_evaluation},
-#' \code{iteration}.
+#' \item env The same environment that is passed under argument `env`.
 #'
-#' @seealso
-#' \code{\link{callbacks}}
+#' It may be modified by the functions in order to e.g. keep track of what happens
+#' across iterations or similar.
#' -#' @export -cb.evaluation.log <- function() { - - mnames <- NULL - - init <- function(env) { - if (!is.list(env$evaluation_log)) - stop("'evaluation_log' has to be a list") - mnames <<- names(env$bst_evaluation) - if (is.null(mnames) || any(mnames == "")) - stop("bst_evaluation must have non-empty names") - - mnames <<- gsub('-', '_', names(env$bst_evaluation), fixed = TRUE) - if (!is.null(env$bst_evaluation_err)) - mnames <<- c(paste0(mnames, '_mean'), paste0(mnames, '_std')) - } - - finalizer <- function(env) { - env$evaluation_log <- as.data.table(t(simplify2array(env$evaluation_log))) - setnames(env$evaluation_log, c('iter', mnames)) - - if (!is.null(env$bst_evaluation_err)) { - # rearrange col order from _mean,_mean,...,_std,_std,... - # to be _mean,_std,_mean,_std,... - len <- length(mnames) - means <- mnames[seq_len(len / 2)] - stds <- mnames[(len / 2 + 1):len] - cnames <- numeric(len) - cnames[c(TRUE, FALSE)] <- means - cnames[c(FALSE, TRUE)] <- stds - env$evaluation_log <- env$evaluation_log[, c('iter', cnames), with = FALSE] - } - } - - callback <- function(env = parent.frame(), finalize = FALSE) { - if (is.null(mnames)) - init(env) - - if (finalize) - return(finalizer(env)) - - ev <- env$bst_evaluation - if (!is.null(env$bst_evaluation_err)) - ev <- c(ev, env$bst_evaluation_err) - env$evaluation_log <- c(env$evaluation_log, - list(c(iter = env$iteration, ev))) - } - attr(callback, 'call') <- match.call() - attr(callback, 'name') <- 'cb.evaluation.log' - callback -} - -#' Callback closure for resetting the booster's parameters at each iteration. +#' This environment is only used by the functions supplied to the callback, and will +#' not be kept after the model fitting function terminates (see parameter `f_after_training`). #' -#' @param new_params a list where each element corresponds to a parameter that needs to be reset. -#' Each element's value must be either a vector of values of length \code{nrounds} -#' to be set at each iteration, -#' or a function of two parameters \code{learning_rates(iteration, nrounds)} -#' which returns a new parameter value by using the current iteration number -#' and the total number of boosting rounds. +#' \item model The booster object when using \link{xgb.train}, or the folds when using +#' \link{xgb.cv}. #' -#' @details -#' This is a "pre-iteration" callback function used to reset booster's parameters -#' at the beginning of each iteration. -#' -#' Note that when training is resumed from some previous model, and a function is used to -#' reset a parameter value, the \code{nrounds} argument in this function would be the -#' the number of boosting rounds in the current training. -#' -#' Callback function expects the following values to be set in its calling frame: -#' \code{bst} or \code{bst_folds}, -#' \code{iteration}, -#' \code{begin_iteration}, -#' \code{end_iteration}. 
-#' -#' @seealso -#' \code{\link{callbacks}} -#' -#' @export -cb.reset.parameters <- function(new_params) { - - if (typeof(new_params) != "list") - stop("'new_params' must be a list") - pnames <- gsub(".", "_", names(new_params), fixed = TRUE) - nrounds <- NULL - - # run some checks in the beginning - init <- function(env) { - nrounds <<- env$end_iteration - env$begin_iteration + 1 - - if (is.null(env$bst) && is.null(env$bst_folds)) - stop("Parent frame has neither 'bst' nor 'bst_folds'") - - # Some parameters are not allowed to be changed, - # since changing them would simply wreck some chaos - not_allowed <- pnames %in% - c('num_class', 'num_output_group', 'size_leaf_vector', 'updater_seq') - if (any(not_allowed)) - stop('Parameters ', paste(pnames[not_allowed]), " cannot be changed during boosting.") - - for (n in pnames) { - p <- new_params[[n]] - if (is.function(p)) { - if (length(formals(p)) != 2) - stop("Parameter '", n, "' is a function but not of two arguments") - } else if (is.numeric(p) || is.character(p)) { - if (length(p) != nrounds) - stop("Length of '", n, "' has to be equal to 'nrounds'") - } else { - stop("Parameter '", n, "' is not a function or a vector") - } - } - } - - callback <- function(env = parent.frame()) { - if (is.null(nrounds)) - init(env) - - i <- env$iteration - pars <- lapply(new_params, function(p) { - if (is.function(p)) - return(p(i, nrounds)) - p[i] - }) - - if (!is.null(env$bst)) { - xgb.parameters(env$bst) <- pars - } else { - for (fd in env$bst_folds) - xgb.parameters(fd$bst) <- pars - } - } - attr(callback, 'is_pre_iteration') <- TRUE - attr(callback, 'call') <- match.call() - attr(callback, 'name') <- 'cb.reset.parameters' - callback -} - - -#' Callback closure to activate the early stopping. -#' -#' @param stopping_rounds The number of rounds with no improvement in -#' the evaluation metric in order to stop the training. -#' @param maximize whether to maximize the evaluation metric -#' @param metric_name the name of an evaluation column to use as a criteria for early -#' stopping. If not set, the last column would be used. -#' Let's say the test data in \code{watchlist} was labelled as \code{dtest}, -#' and one wants to use the AUC in test data for early stopping regardless of where -#' it is in the \code{watchlist}, then one of the following would need to be set: -#' \code{metric_name='dtest-auc'} or \code{metric_name='dtest_auc'}. -#' All dash '-' characters in metric names are considered equivalent to '_'. -#' @param verbose whether to print the early stopping information. -#' -#' @details -#' This callback function determines the condition for early stopping -#' by setting the \code{stop_condition = TRUE} flag in its calling frame. -#' -#' The following additional fields are assigned to the model's R object: -#' \itemize{ -#' \item \code{best_score} the evaluation score at the best iteration -#' \item \code{best_iteration} at which boosting iteration the best score has occurred (1-based index) -#' } -#' The Same values are also stored as xgb-attributes: -#' \itemize{ -#' \item \code{best_iteration} is stored as a 0-based iteration index (for interoperability of binary models) -#' \item \code{best_msg} message string is also stored. +#' For \link{xgb.cv}, folds are a list with a structure as follows:\itemize{ +#' \item `dtrain`: The training data for the fold (as an `xgb.DMatrix` object). +#' \item `bst`: Rhe `xgb.Booster` object for the fold. 
+#' \item `evals`: A list containing two DMatrices, with names `train` and `test` +#' (`test` is the held-out data for the fold). +#' \item `index`: The indices of the hold-out data for that fold (base-1 indexing), +#' from which the `test` entry in `evals` was obtained. #' } #' -#' At least one data element is required in the evaluation watchlist for early stopping to work. +#' This object should \bold{not} be in-place modified in ways that conflict with the +#' training (e.g. resetting the parameters for a training update in a way that resets +#' the number of rounds to zero in order to overwrite rounds). #' -#' Callback function expects the following values to be set in its calling frame: -#' \code{stop_condition}, -#' \code{bst_evaluation}, -#' \code{rank}, -#' \code{bst} (or \code{bst_folds} and \code{basket}), -#' \code{iteration}, -#' \code{begin_iteration}, -#' \code{end_iteration}, +#' Note that any R attributes that are assigned to the booster during the callback functions, +#' will not be kept thereafter as the booster object variable is not re-assigned during +#' training. It is however possible to set C-level attributes of the booster through +#' \link{xgb.attr} or \link{xgb.attributes}, which should remain available for the rest +#' of the iterations and after the training is done. #' -#' @seealso -#' \code{\link{callbacks}}, -#' \code{\link{xgb.attr}} +#' For keeping variables across iterations, it's recommended to use `env` instead. +#' \item data The data to which the model is being fit, as an `xgb.DMatrix` object. #' -#' @export -cb.early.stop <- function(stopping_rounds, maximize = FALSE, - metric_name = NULL, verbose = TRUE) { - # state variables - best_iteration <- -1 - best_score <- Inf - best_msg <- NULL - metric_idx <- 1 - - init <- function(env) { - if (length(env$bst_evaluation) == 0) - stop("For early stopping, watchlist must have at least one element") - - eval_names <- gsub('-', '_', names(env$bst_evaluation), fixed = TRUE) - if (!is.null(metric_name)) { - metric_idx <<- which(gsub('-', '_', metric_name, fixed = TRUE) == eval_names) - if (length(metric_idx) == 0) - stop("'metric_name' for early stopping is not one of the following:\n", - paste(eval_names, collapse = ' '), '\n') - } - if (is.null(metric_name) && - length(env$bst_evaluation) > 1) { - metric_idx <<- length(eval_names) - if (verbose) - cat('Multiple eval metrics are present. 
Will use ', - eval_names[metric_idx], ' for early stopping.\n', sep = '') - } - - metric_name <<- eval_names[metric_idx] - - # maximize is usually NULL when not set in xgb.train and built-in metrics - if (is.null(maximize)) - maximize <<- grepl('(_auc|_map|_ndcg|_pre)', metric_name) - - if (verbose && NVL(env$rank, 0) == 0) - cat("Will train until ", metric_name, " hasn't improved in ", - stopping_rounds, " rounds.\n\n", sep = '') - - best_iteration <<- 1 - if (maximize) best_score <<- -Inf - - env$stop_condition <- FALSE - - if (!is.null(env$bst)) { - if (!inherits(env$bst, 'xgb.Booster')) - stop("'bst' in the parent frame must be an 'xgb.Booster'") - if (!is.null(best_score <- xgb.attr(env$bst, 'best_score'))) { - best_score <<- as.numeric(best_score) - best_iteration <<- as.numeric(xgb.attr(env$bst, 'best_iteration')) + 1 - best_msg <<- as.numeric(xgb.attr(env$bst, 'best_msg')) - } else { - xgb.attributes(env$bst) <- list(best_iteration = best_iteration - 1, - best_score = best_score) - } - } else if (is.null(env$bst_folds) || is.null(env$basket)) { - stop("Parent frame has neither 'bst' nor ('bst_folds' and 'basket')") - } - } - - finalizer <- function(env) { - if (!is.null(env$bst)) { - attr_best_score <- as.numeric(xgb.attr(env$bst, 'best_score')) - if (best_score != attr_best_score) { - # If the difference is too big, throw an error - if (abs(best_score - attr_best_score) >= 1e-14) { - stop("Inconsistent 'best_score' values between the closure state: ", best_score, - " and the xgb.attr: ", attr_best_score) - } - # If the difference is due to floating-point truncation, update best_score - best_score <- attr_best_score - } - xgb.attr(env$bst, "best_iteration") <- best_iteration - 1 - xgb.attr(env$bst, "best_score") <- best_score - } else { - env$basket$best_iteration <- best_iteration - } - } - - callback <- function(env = parent.frame(), finalize = FALSE) { - if (best_iteration < 0) - init(env) - - if (finalize) - return(finalizer(env)) - - i <- env$iteration - score <- env$bst_evaluation[metric_idx] - - if ((maximize && score > best_score) || - (!maximize && score < best_score)) { - - best_msg <<- .format_eval_string( - i, env$bst_evaluation, env$bst_evaluation_err - ) - best_score <<- score - best_iteration <<- i - # save the property to attributes, so they will occur in checkpoint - if (!is.null(env$bst)) { - xgb.attributes(env$bst) <- list( - best_iteration = best_iteration - 1, # convert to 0-based index - best_score = best_score, - best_msg = best_msg - ) - } - } else if (i - best_iteration >= stopping_rounds) { - env$stop_condition <- TRUE - env$end_iteration <- i - if (verbose && NVL(env$rank, 0) == 0) - cat("Stopping. Best iteration:\n", best_msg, "\n\n", sep = '') - } - } - attr(callback, 'call') <- match.call() - attr(callback, 'name') <- 'cb.early.stop' - callback -} - - -#' Callback closure for saving a model file. +#' Note that, for \link{xgb.cv}, this will be the full data, while data for the specific +#' folds can be found in the `model` object. #' -#' @param save_period save the model to disk after every -#' \code{save_period} iterations; 0 means save the model at the end. -#' @param save_name the name or path for the saved model file. +#' \item evals The evaluation data, as passed under argument `evals` to +#' \link{xgb.train}. #' -#' Note that the format of the model being saved is determined by the file -#' extension specified here (see \link{xgb.save} for details about how it works). +#' For \link{xgb.cv}, this will always be `NULL`. 
 #'
-#' It can contain a \code{\link[base]{sprintf}} formatting specifier
-#' to include the integer iteration number in the file name.
-#' E.g., with \code{save_name} = 'xgboost_%04d.ubj',
-#' the file saved at iteration 50 would be named "xgboost_0050.ubj".
-#' @seealso \link{xgb.save}
-#' @details
-#' This callback function allows to save an xgb-model file, either periodically after each \code{save_period}'s or at the end.
+#' \item begin_iteration Index of the first boosting iteration that will be executed
+#' (base-1 indexing).
 #'
-#' Callback function expects the following values to be set in its calling frame:
-#' \code{bst},
-#' \code{iteration},
-#' \code{begin_iteration},
-#' \code{end_iteration}.
+#' This will typically be '1', but when using training continuation, depending on the
+#' parameters for updates, boosting rounds will be continued from where the previous
+#' model ended, in which case this will be larger than 1.
 #'
-#' @seealso
-#' \code{\link{callbacks}}
+#' \item end_iteration Index of the last boosting iteration that will be executed
+#' (base-1 indexing, inclusive of this end).
 #'
-#' @export
-cb.save.model <- function(save_period = 0, save_name = "xgboost.ubj") {
-
-  if (save_period < 0)
-    stop("'save_period' cannot be negative")
-
-  callback <- function(env = parent.frame()) {
-    if (is.null(env$bst))
-      stop("'save_model' callback requires the 'bst' booster object in its calling frame")
-
-    if ((save_period > 0 && (env$iteration - env$begin_iteration) %% save_period == 0) ||
-        (save_period == 0 && env$iteration == env$end_iteration)) {
-      # Note: this throws a warning if the name doesn't have anything to format through 'sprintf'
-      suppressWarnings({
-        save_name <- sprintf(save_name, env$iteration)
-      })
-      xgb.save(env$bst, save_name)
-    }
-  }
-  attr(callback, 'call') <- match.call()
-  attr(callback, 'name') <- 'cb.save.model'
-  callback
-}
-
-
-#' Callback closure for returning cross-validation based predictions.
+#' It should match with argument `nrounds` passed to \link{xgb.train} or \link{xgb.cv}.
 #'
-#' @param save_models a flag for whether to save the folds' models.
+#' Note that boosting might be interrupted before reaching this last iteration, for
+#' example by using the early stopping callback \link{xgb.cb.early.stop}.
 #'
-#' @details
-#' This callback function saves predictions for all of the test folds,
-#' and also allows to save the folds' models.
+#' \item iteration Index of the iteration number that is being executed (first iteration
+#' will be the same as parameter `begin_iteration`, then next one will add +1, and so on).
 #'
-#' It is a "finalizer" callback and it uses early stopping information whenever it is available,
-#' thus it must be run after the early stopping callback if the early stopping is used.
+#' \item iter_feval Evaluation metrics for `evals` that were supplied, either
+#' determined by the objective, or by parameter `feval`.
 #'
-#' Callback function expects the following values to be set in its calling frame:
-#' \code{bst_folds},
-#' \code{basket},
-#' \code{data},
-#' \code{end_iteration},
-#' \code{params},
+#' For \link{xgb.train}, this will be a named vector with one entry per element in
+#' `evals`, where the names are determined as 'evals name' + '-' + 'metric name' - for
+#' example, if `evals` contains an entry named "tr" and the metric is "rmse",
+#' this will be a one-element vector with name "tr-rmse".
#' -#' @return -#' Predictions are returned inside of the \code{pred} element, which is either a vector or a matrix, -#' depending on the number of prediction outputs per data row. The order of predictions corresponds -#' to the order of rows in the original dataset. Note that when a custom \code{folds} list is -#' provided in \code{xgb.cv}, the predictions would only be returned properly when this list is a -#' non-overlapping list of k sets of indices, as in a standard k-fold CV. The predictions would not be -#' meaningful when user-provided folds have overlapping indices as in, e.g., random sampling splits. -#' When some of the indices in the training dataset are not included into user-provided \code{folds}, -#' their prediction value would be \code{NA}. +#' For \link{xgb.cv}, this will be a 2d matrix with dimensions `[length(evals), nfolds]`, +#' where the row names will follow the same naming logic as the one-dimensional vector +#' that is passed in \link{xgb.train}. #' -#' @seealso -#' \code{\link{callbacks}} +#' Note that, internally, the built-in callbacks such as \link{xgb.cb.print.evaluation} summarize +#' this table by calculating the row-wise means and standard deviations. #' -#' @export -cb.cv.predict <- function(save_models = FALSE) { - - finalizer <- function(env) { - if (is.null(env$basket) || is.null(env$bst_folds)) - stop("'cb.cv.predict' callback requires 'basket' and 'bst_folds' lists in its calling frame") - - N <- nrow(env$data) - pred <- NULL - - iterationrange <- c(1, NVL(env$basket$best_iteration, env$end_iteration)) - if (NVL(env$params[['booster']], '') == 'gblinear') { - iterationrange <- "all" - } - for (fd in env$bst_folds) { - pr <- predict(fd$bst, fd$watchlist[[2]], iterationrange = iterationrange, reshape = TRUE) - if (is.null(pred)) { - if (NCOL(pr) > 1L) { - pred <- matrix(NA_real_, N, ncol(pr)) - } else { - pred <- matrix(NA_real_, N) - } - } - if (is.matrix(pred)) { - pred[fd$index, ] <- pr - } else { - pred[fd$index] <- pr - } - } - env$basket$pred <- pred - if (save_models) { - env$basket$models <- lapply(env$bst_folds, function(fd) { - return(fd$bst) - }) - } - } - - callback <- function(env = parent.frame(), finalize = FALSE) { - if (finalize) - return(finalizer(env)) - } - attr(callback, 'call') <- match.call() - attr(callback, 'name') <- 'cb.cv.predict' - callback -} - - -#' Callback closure for collecting the model coefficients history of a gblinear booster -#' during its training. +#' \item final_feval The evaluation results after the last boosting round is executed +#' (same format as `iter_feval`, and will be the exact same input as passed under +#' `iter_feval` to the last round that is executed during model fitting). #' -#' @param sparse when set to FALSE/TRUE, a dense/sparse matrix is used to store the result. -#' Sparse format is useful when one expects only a subset of coefficients to be non-zero, -#' when using the "thrifty" feature selector with fairly small number of top features -#' selected per iteration. +#' \item prev_cb_res Result from a previous run of a callback sharing the same name +#' (as given by parameter `cb_name`) when conducting training continuation, if there +#' was any in the booster R attributes. #' -#' @details -#' To keep things fast and simple, gblinear booster does not internally store the history of linear -#' model coefficients at each boosting iteration. This callback provides a workaround for storing -#' the coefficients' path, by extracting them after each training iteration. 
+#' Sometimes, one might want to append the new results to the previous one, and this will
+#' be done automatically by the built-in callbacks such as \link{xgb.cb.evaluation.log},
+#' which will append the new rows to the previous table.
 #'
-#' Callback function expects the following values to be set in its calling frame:
-#' \code{bst} (or \code{bst_folds}).
+#' If no such previous callback result is available (which it never will when fitting
+#' a model from start instead of updating an existing model), this will be `NULL`.
 #'
-#' @return
-#' Results are stored in the \code{coefs} element of the closure.
-#' The \code{\link{xgb.gblinear.history}} convenience function provides an easy
-#' way to access it.
-#' With \code{xgb.train}, it is either a dense of a sparse matrix.
-#' While with \code{xgb.cv}, it is a list (an element per each fold) of such
-#' matrices.
+#' For \link{xgb.cv}, which doesn't support training continuation, this will always be `NULL`.
+#' }
 #'
-#' @seealso
-#' \code{\link{callbacks}}, \code{\link{xgb.gblinear.history}}.
+#' The following names (`cb_name` values) are reserved for internal callbacks:\itemize{
+#' \item print_evaluation
+#' \item evaluation_log
+#' \item reset_parameters
+#' \item early_stop
+#' \item save_model
+#' \item cv_predict
+#' \item gblinear_history
+#' }
 #'
+#' The following names are reserved for other non-callback attributes:\itemize{
+#' \item names
+#' \item class
+#' \item call
+#' \item params
+#' \item niter
+#' \item nfeatures
+#' \item folds
+#' }
+#'
+#' When using the built-in early stopping callback (\link{xgb.cb.early.stop}), said callback
+#' will always be executed before the others, as it sets some booster C-level attributes
+#' that other callbacks might also use. Otherwise, the order of execution will match with
+#' the order in which the callbacks are passed to the model fitting function.
+#' @seealso Built-in callbacks:\itemize{
+#' \item \link{xgb.cb.print.evaluation}
+#' \item \link{xgb.cb.evaluation.log}
+#' \item \link{xgb.cb.reset.parameters}
+#' \item \link{xgb.cb.early.stop}
+#' \item \link{xgb.cb.save.model}
+#' \item \link{xgb.cb.cv.predict}
+#' \item \link{xgb.cb.gblinear.history}
+#' }
+#' @examples
+#' # Example constructing a custom callback that calculates
+#' # squared error on the training data (no separate test set),
+#' # and outputs the per-iteration results.
+#' ssq_callback <- xgb.Callback(
+#'   cb_name = "ssq",
+#'   f_before_training = function(env, model, data, evals,
+#'                                 begin_iteration, end_iteration) {
+#'     # A vector to keep track of a number at each iteration
+#'     env$logs <- rep(NA_real_, end_iteration - begin_iteration + 1)
+#'   },
+#'   f_after_iter = function(env, model, data, evals, iteration, iter_feval) {
+#'     # This calculates the sum of squared errors on the training data.
+#'     # Note that this can be better done by passing an 'evals' entry,
+#'     # but this demonstrates a way in which callbacks can be structured.
+#' pred <- predict(model, data) +#' err <- pred - getinfo(data, "label") +#' sq_err <- sum(err^2) +#' env$logs[iteration] <- sq_err +#' cat( +#' sprintf( +#' "Squared error at iteration %d: %.2f\n", +#' iteration, sq_err +#' ) +#' ) #' -#' ## Keep the number of threads to 1 for examples -#' nthread <- 1 -#' data.table::setDTthreads(nthread) +#' # A return value of 'TRUE' here would signal to finalize the training +#' return(FALSE) +#' }, +#' f_after_training = function(env, model, data, evals, iteration, +#' final_feval, prev_cb_res) { +#' return(env$logs) +#' } +#' ) #' -#' # In the iris dataset, it is hard to linearly separate Versicolor class from the rest -#' # without considering the 2nd order interactions: -#' x <- model.matrix(Species ~ .^2, iris)[,-1] -#' colnames(x) -#' dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"), nthread = nthread) -#' param <- list(booster = "gblinear", objective = "reg:logistic", eval_metric = "auc", -#' lambda = 0.0003, alpha = 0.0003, nthread = nthread) -#' # For 'shotgun', which is a default linear updater, using high eta values may result in -#' # unstable behaviour in some datasets. With this simple dataset, however, the high learning -#' # rate does not break the convergence, but allows us to illustrate the typical pattern of -#' # "stochastic explosion" behaviour of this lock-free algorithm at early boosting iterations. -#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 1., -#' callbacks = list(cb.gblinear.history())) -#' # Extract the coefficients' path and plot them vs boosting iteration number: -#' coef_path <- xgb.gblinear.history(bst) -#' matplot(coef_path, type = 'l') -#' -#' # With the deterministic coordinate descent updater, it is safer to use higher learning rates. -#' # Will try the classical componentwise boosting which selects a single best feature per round: -#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 0.8, -#' updater = 'coord_descent', feature_selector = 'thrifty', top_k = 1, -#' callbacks = list(cb.gblinear.history())) -#' matplot(xgb.gblinear.history(bst), type = 'l') -#' # Componentwise boosting is known to have similar effect to Lasso regularization. -#' # Try experimenting with various values of top_k, eta, nrounds, -#' # as well as different feature_selectors. 
-#' -#' # For xgb.cv: -#' bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 100, eta = 0.8, -#' callbacks = list(cb.gblinear.history())) -#' # coefficients in the CV fold #3 -#' matplot(xgb.gblinear.history(bst)[[3]], type = 'l') -#' -#' -#' #### Multiclass classification: -#' # -#' dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = nthread) -#' param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3, -#' lambda = 0.0003, alpha = 0.0003, nthread = nthread) -#' # For the default linear updater 'shotgun' it sometimes is helpful -#' # to use smaller eta to reduce instability -#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 50, eta = 0.5, -#' callbacks = list(cb.gblinear.history())) -#' # Will plot the coefficient paths separately for each class: -#' matplot(xgb.gblinear.history(bst, class_index = 0), type = 'l') -#' matplot(xgb.gblinear.history(bst, class_index = 1), type = 'l') -#' matplot(xgb.gblinear.history(bst, class_index = 2), type = 'l') -#' -#' # CV: -#' bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 70, eta = 0.5, -#' callbacks = list(cb.gblinear.history(FALSE))) -#' # 1st fold of 1st class -#' matplot(xgb.gblinear.history(bst, class_index = 0)[[1]], type = 'l') +#' data(mtcars) +#' y <- mtcars$mpg +#' x <- as.matrix(mtcars[, -1]) +#' dm <- xgb.DMatrix(x, label = y, nthread = 1) +#' model <- xgb.train( +#' data = dm, +#' params = list(objective = "reg:squarederror", nthread = 1), +#' nrounds = 5, +#' callbacks = list(ssq_callback), +#' keep_extra_attributes = TRUE +#' ) #' +#' # Result from 'f_after_iter' will be available as an attribute +#' attributes(model)$ssq #' @export -cb.gblinear.history <- function(sparse = FALSE) { - coefs <- NULL +xgb.Callback <- function( + cb_name = "custom_callback", + env = new.env(), + f_before_training = function(env, model, data, evals, begin_iteration, end_iteration) NULL, + f_before_iter = function(env, model, data, evals, iteration) NULL, + f_after_iter = function(env, model, data, evals, iteration, iter_feval) NULL, + f_after_training = function(env, model, data, evals, iteration, final_feval, prev_cb_res) NULL +) { + stopifnot(is.null(f_before_training) || is.function(f_before_training)) + stopifnot(is.null(f_before_iter) || is.function(f_before_iter)) + stopifnot(is.null(f_after_iter) || is.function(f_after_iter)) + stopifnot(is.null(f_after_training) || is.function(f_after_training)) + stopifnot(is.character(cb_name) && length(cb_name) == 1) - init <- function(env) { - # xgb.train(): bst will be present - # xgb.cv(): bst_folds will be present - if (is.null(env$bst) && is.null(env$bst_folds)) { - stop("Parent frame has neither 'bst' nor 'bst_folds'") - } + if (cb_name %in% .reserved_cb_names) { + stop("Cannot use reserved callback name '", cb_name, "'.") } - # convert from list to (sparse) matrix - list2mat <- function(coef_list) { - if (sparse) { - coef_mat <- sparseMatrix(x = unlist(lapply(coef_list, slot, "x")), - i = unlist(lapply(coef_list, slot, "i")), - p = c(0, cumsum(sapply(coef_list, function(x) length(x@x)))), - dims = c(length(coef_list[[1]]), length(coef_list))) - return(t(coef_mat)) - } else { - return(do.call(rbind, coef_list)) - } - } + out <- list( + cb_name = cb_name, + env = env, + f_before_training = f_before_training, + f_before_iter = f_before_iter, + f_after_iter = f_after_iter, + f_after_training = f_after_training + ) + class(out) <- "xgb.Callback" + return(out) +} - finalizer <- function(env) { - if (length(coefs) == 0) - return() - if 
(!is.null(env$bst)) { # # xgb.train: - coefs <<- list2mat(coefs) - } else { # xgb.cv: - # second lapply transposes the list - coefs <<- lapply( - X = lapply( - X = seq_along(coefs[[1]]), - FUN = function(i) lapply(coefs, "[[", i) - ), - FUN = list2mat +.execute.cb.before.training <- function( + callbacks, + model, + data, + evals, + begin_iteration, + end_iteration +) { + for (callback in callbacks) { + if (!is.null(callback$f_before_training)) { + callback$f_before_training( + callback$env, + model, + data, + evals, + begin_iteration, + end_iteration ) } } - - extract.coef <- function(env) { - if (!is.null(env$bst)) { # # xgb.train: - cf <- as.numeric(grep('(booster|bias|weigh)', xgb.dump(env$bst), invert = TRUE, value = TRUE)) - if (sparse) cf <- as(cf, "sparseVector") - } else { # xgb.cv: - cf <- vector("list", length(env$bst_folds)) - for (i in seq_along(env$bst_folds)) { - dmp <- xgb.dump(env$bst_folds[[i]]$bst) - cf[[i]] <- as.numeric(grep('(booster|bias|weigh)', dmp, invert = TRUE, value = TRUE)) - if (sparse) cf[[i]] <- as(cf[[i]], "sparseVector") - } - } - cf - } - - callback <- function(env = parent.frame(), finalize = FALSE) { - if (is.null(coefs)) init(env) - if (finalize) return(finalizer(env)) - cf <- extract.coef(env) - coefs <<- c(coefs, list(cf)) - } - - attr(callback, 'call') <- match.call() - attr(callback, 'name') <- 'cb.gblinear.history' - callback } -#' @title Extract gblinear coefficients history. -#' @description A helper function to extract the matrix of linear coefficients' history -#' from a gblinear model created while using the \code{cb.gblinear.history()} -#' callback. -#' @details Note that this is an R-specific function that relies on R attributes that -#' are not saved when using xgboost's own serialization functions like \link{xgb.load} -#' or \link{xgb.load.raw}. -#' -#' In order for a serialized model to be accepted by tgis function, one must use R -#' serializers such as \link{saveRDS}. -#' @param model either an \code{xgb.Booster} or a result of \code{xgb.cv()}, trained -#' using the \code{cb.gblinear.history()} callback, but \bold{not} a booster -#' loaded from \link{xgb.load} or \link{xgb.load.raw}. -#' @param class_index zero-based class index to extract the coefficients for only that -#' specific class in a multinomial multiclass model. When it is NULL, all the -#' coefficients are returned. Has no effect in non-multiclass models. -#' -#' @return -#' For an \code{xgb.train} result, a matrix (either dense or sparse) with the columns -#' corresponding to iteration's coefficients (in the order as \code{xgb.dump()} would -#' return) and the rows corresponding to boosting iterations. -#' -#' For an \code{xgb.cv} result, a list of such matrices is returned with the elements -#' corresponding to CV folds. 
-#' -#' @export -xgb.gblinear.history <- function(model, class_index = NULL) { - - if (!(inherits(model, "xgb.Booster") || - inherits(model, "xgb.cv.synchronous"))) - stop("model must be an object of either xgb.Booster or xgb.cv.synchronous class") - is_cv <- inherits(model, "xgb.cv.synchronous") - - if (is_cv) { - callbacks <- model$callbacks - } else { - callbacks <- attributes(model)$callbacks +.execute.cb.before.iter <- function( + callbacks, + model, + data, + evals, + iteration +) { + if (!length(callbacks)) { + return(FALSE) } + out <- sapply(callbacks, function(cb) { + if (is.null(cb$f_before_iter)) { + return(FALSE) + } + should_stop <- cb$f_before_iter( + cb$env, + model, + data, + evals, + iteration + ) + if (!NROW(should_stop)) { + should_stop <- FALSE + } else if (NROW(should_stop) > 1) { + should_stop <- head(as.logical(should_stop), 1) + } + return(should_stop) + }) + return(any(out)) +} - if (is.null(callbacks) || is.null(callbacks$cb.gblinear.history)) - stop("model must be trained while using the cb.gblinear.history() callback") - - if (!is_cv) { - num_class <- xgb.num_class(model) - num_feat <- xgb.num_feature(model) - } else { - # in case of CV, the object is expected to have this info - if (model$params$booster != "gblinear") - stop("It does not appear to be a gblinear model") - num_class <- NVL(model$params$num_class, 1) - num_feat <- model$nfeatures - if (is.null(num_feat)) - stop("This xgb.cv result does not have nfeatures info") +.execute.cb.after.iter <- function( + callbacks, + model, + data, + evals, + iteration, + iter_feval +) { + if (!length(callbacks)) { + return(FALSE) } + out <- sapply(callbacks, function(cb) { + if (is.null(cb$f_after_iter)) { + return(FALSE) + } + should_stop <- cb$f_after_iter( + cb$env, + model, + data, + evals, + iteration, + iter_feval + ) + if (!NROW(should_stop)) { + should_stop <- FALSE + } else if (NROW(should_stop) > 1) { + should_stop <- head(as.logical(should_stop), 1) + } + return(should_stop) + }) + return(any(out)) +} - if (!is.null(class_index) && - num_class > 1 && - (class_index[1] < 0 || class_index[1] >= num_class)) - stop("class_index has to be within [0,", num_class - 1, "]") - - coef_path <- environment(callbacks$cb.gblinear.history)[["coefs"]] - if (!is.null(class_index) && num_class > 1) { - coef_path <- if (is.list(coef_path)) { - lapply(coef_path, - function(x) x[, seq(1 + class_index, by = num_class, length.out = num_feat)]) +.execute.cb.after.training <- function( + callbacks, + model, + data, + evals, + iteration, + final_feval, + prev_cb_res +) { + if (!length(callbacks)) { + return(NULL) + } + old_cb_res <- attributes(model) + out <- lapply(callbacks, function(cb) { + if (is.null(cb$f_after_training)) { + return(NULL) } else { - coef_path <- coef_path[, seq(1 + class_index, by = num_class, length.out = num_feat)] + return( + cb$f_after_training( + cb$env, + model, + data, + evals, + iteration, + final_feval, + getElement(old_cb_res, cb$cb_name) + ) + ) } + }) + names(out) <- sapply(callbacks, function(cb) cb$cb_name) + if (NROW(out)) { + out <- out[!sapply(out, is.null)] } - coef_path + return(out) } +.summarize.feval <- function(iter_feval, showsd) { + if (NCOL(iter_feval) > 1L && showsd) { + stdev <- apply(iter_feval, 1, sd) + } else { + stdev <- NULL + } + if (NCOL(iter_feval) > 1L) { + iter_feval <- rowMeans(iter_feval) + } + return(list(feval = iter_feval, stdev = stdev)) +} -# -# Internal utility functions for callbacks ------------------------------------ -# +.print.evaluation <- function(iter_feval, 
showsd, iteration) { + tmp <- .summarize.feval(iter_feval, showsd) + msg <- .format_eval_string(iteration, tmp$feval, tmp$stdev) + cat(msg, '\n') +} # Format the evaluation metric string .format_eval_string <- function(iter, eval_res, eval_err = NULL) { @@ -784,69 +407,838 @@ xgb.gblinear.history <- function(model, class_index = NULL) { return(paste0(iter, res)) } -# Extract callback names from the list of callbacks -callback.names <- function(cb_list) { - unlist(lapply(cb_list, function(x) attr(x, 'name'))) -} - -# Extract callback calls from the list of callbacks -callback.calls <- function(cb_list) { - unlist(lapply(cb_list, function(x) attr(x, 'call'))) -} - -# Add a callback cb to the list and make sure that -# cb.early.stop and cb.cv.predict are at the end of the list -# with cb.cv.predict being the last (when present) -add.cb <- function(cb_list, cb) { - cb_list <- c(cb_list, cb) - names(cb_list) <- callback.names(cb_list) - if ('cb.early.stop' %in% names(cb_list)) { - cb_list <- c(cb_list, cb_list['cb.early.stop']) - # this removes only the first one - cb_list['cb.early.stop'] <- NULL +#' @title Callback for printing the result of evaluation +#' @param period results would be printed every number of periods +#' @param showsd whether standard deviations should be printed (when available) +#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}. +#' @description +#' The callback function prints the result of evaluation at every \code{period} iterations. +#' The initial and the last iteration's evaluations are always printed. +#' +#' Does not leave any attribute in the booster (see \link{xgb.cb.evaluation.log} for that). +#' @seealso \link{xgb.Callback} +#' @export +xgb.cb.print.evaluation <- function(period = 1, showsd = TRUE) { + if (length(period) != 1 || period != floor(period) || period < 1) { + stop("'period' must be a positive integer.") } - if ('cb.cv.predict' %in% names(cb_list)) { - cb_list <- c(cb_list, cb_list['cb.cv.predict']) - cb_list['cb.cv.predict'] <- NULL - } - cb_list -} -# Sort callbacks list into categories -categorize.callbacks <- function(cb_list) { - list( - pre_iter = Filter(function(x) { - pre <- attr(x, 'is_pre_iteration') - !is.null(pre) && pre - }, cb_list), - post_iter = Filter(function(x) { - pre <- attr(x, 'is_pre_iteration') - is.null(pre) || !pre - }, cb_list), - finalize = Filter(function(x) { - 'finalize' %in% names(formals(x)) - }, cb_list) + xgb.Callback( + cb_name = "print_evaluation", + env = as.environment(list(period = period, showsd = showsd, is_first_call = TRUE)), + f_before_training = NULL, + f_before_iter = NULL, + f_after_iter = function(env, model, data, evals, iteration, iter_feval) { + if (is.null(iter_feval)) { + return(FALSE) + } + if (env$is_first_call || (iteration - 1) %% env$period == 0) { + .print.evaluation(iter_feval, env$showsd, iteration) + env$last_printed_iter <- iteration + } + env$is_first_call <- FALSE + return(FALSE) + }, + f_after_training = function(env, model, data, evals, iteration, final_feval, prev_cb_res) { + if (is.null(final_feval)) { + return(NULL) + } + if (is.null(env$last_printed_iter) || iteration > env$last_printed_iter) { + .print.evaluation(final_feval, env$showsd, iteration) + } + } ) } -# Check whether all callback functions with names given by 'query_names' are present in the 'cb_list'. 
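A usage sketch for the printing callback defined above: it is passed through the `callbacks` argument of `xgb.train()` and prints the evaluation result every `period` iterations (the first and last iterations are always printed). Dataset and object names below are illustrative and not part of this patch.

library(xgboost)
data(agaricus.train, package = "xgboost")
data(agaricus.test, package = "xgboost")
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label, nthread = 1)
dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label, nthread = 1)
# Print evaluation results only every 5th boosting round.
bst <- xgb.train(
  params = list(objective = "binary:logistic", nthread = 1),
  data = dtrain,
  nrounds = 20,
  evals = list(train = dtrain, test = dtest),
  callbacks = list(xgb.cb.print.evaluation(period = 5))
)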
-has.callbacks <- function(cb_list, query_names) { - if (length(cb_list) < length(query_names)) - return(FALSE) - if (!is.list(cb_list) || - any(sapply(cb_list, class) != 'function')) { - stop('`cb_list` must be a list of callback functions') - } - cb_names <- callback.names(cb_list) - if (!is.character(cb_names) || - length(cb_names) != length(cb_list) || - any(cb_names == "")) { - stop('All callbacks in the `cb_list` must have a non-empty `name` attribute') - } - if (!is.character(query_names) || - length(query_names) == 0 || - any(query_names == "")) { - stop('query_names must be a non-empty vector of non-empty character names') - } - return(all(query_names %in% cb_names)) +#' @title Callback for logging the evaluation history +#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}. +#' @details This callback creates a table with per-iteration evaluation metrics (see parameters +#' `evals` and `feval` in \link{xgb.train}). +#' @details +#' Note: in the column names of the final data.table, the dash '-' character is replaced with +#' the underscore '_' in order to make the column names more like regular R identifiers. +#' @seealso \link{xgb.cb.print.evaluation} +#' @export +xgb.cb.evaluation.log <- function() { + xgb.Callback( + cb_name = "evaluation_log", + f_before_training = function(env, model, data, evals, begin_iteration, end_iteration) { + env$evaluation_log <- vector("list", end_iteration - begin_iteration + 1) + env$next_log <- 1 + }, + f_before_iter = NULL, + f_after_iter = function(env, model, data, evals, iteration, iter_feval) { + tmp <- .summarize.feval(iter_feval, TRUE) + env$evaluation_log[[env$next_log]] <- list(iter = iteration, metrics = tmp$feval, sds = tmp$stdev) + env$next_log <- env$next_log + 1 + return(FALSE) + }, + f_after_training = function(env, model, data, evals, iteration, final_feval, prev_cb_res) { + if (!NROW(env$evaluation_log)) { + return(prev_cb_res) + } + # in case of early stopping + if (env$next_log <= length(env$evaluation_log)) { + env$evaluation_log <- head(env$evaluation_log, env$next_log - 1) + } + + iters <- data.frame(iter = sapply(env$evaluation_log, function(x) x$iter)) + metrics <- do.call(rbind, lapply(env$evaluation_log, function(x) x$metrics)) + mnames <- gsub("-", "_", names(env$evaluation_log[[1]]$metrics), fixed = TRUE) + colnames(metrics) <- mnames + has_sds <- !is.null(env$evaluation_log[[1]]$sds) + if (has_sds) { + sds <- do.call(rbind, lapply(env$evaluation_log, function(x) x$sds)) + colnames(sds) <- mnames + metrics <- lapply( + mnames, + function(metric) { + out <- cbind(metrics[, metric], sds[, metric]) + colnames(out) <- paste0(metric, c("_mean", "_std")) + return(out) + } + ) + metrics <- do.call(cbind, metrics) + } + evaluation_log <- cbind(iters, metrics) + + if (!is.null(prev_cb_res)) { + if (!is.data.table(prev_cb_res)) { + prev_cb_res <- data.table::as.data.table(prev_cb_res) + } + prev_take <- prev_cb_res[prev_cb_res$iter < min(evaluation_log$iter)] + if (nrow(prev_take)) { + evaluation_log <- rbind(prev_cb_res, evaluation_log) + } + } + evaluation_log <- data.table::as.data.table(evaluation_log) + return(evaluation_log) + } + ) +} + +#' @title Callback for resetting the booster's parameters at each iteration. +#' @param new_params a list where each element corresponds to a parameter that needs to be reset. 
+#' Each element's value must be either a vector of values of length \code{nrounds} +#' to be set at each iteration, +#' or a function of two parameters \code{learning_rates(iteration, nrounds)} +#' which returns a new parameter value by using the current iteration number +#' and the total number of boosting rounds. +#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}. +#' @details +#' Note that when training is resumed from some previous model, and a function is used to +#' reset a parameter value, the \code{nrounds} argument in this function would be the +#' the number of boosting rounds in the current training. +#' +#' Does not leave any attribute in the booster. +#' @export +xgb.cb.reset.parameters <- function(new_params) { + stopifnot(is.list(new_params)) + pnames <- gsub(".", "_", names(new_params), fixed = TRUE) + not_allowed <- pnames %in% + c('num_class', 'num_output_group', 'size_leaf_vector', 'updater_seq') + if (any(not_allowed)) + stop('Parameters ', paste(pnames[not_allowed]), " cannot be changed during boosting.") + + xgb.Callback( + cb_name = "reset_parameters", + env = as.environment(list(new_params = new_params)), + f_before_training = function(env, model, data, evals, begin_iteration, end_iteration) { + env$end_iteration <- end_iteration + + pnames <- gsub(".", "_", names(env$new_params), fixed = TRUE) + for (n in pnames) { + p <- env$new_params[[n]] + if (is.function(p)) { + if (length(formals(p)) != 2) + stop("Parameter '", n, "' is a function but not of two arguments") + } else if (is.numeric(p) || is.character(p)) { + if (length(p) != env$end_iteration) + stop("Length of '", n, "' has to be equal to 'nrounds'") + } else { + stop("Parameter '", n, "' is not a function or a vector") + } + } + }, + f_before_iter = function(env, model, data, evals, iteration) { + pars <- lapply(env$new_params, function(p) { + if (is.function(p)) { + return(p(iteration, env$end_iteration)) + } else { + return(p[iteration]) + } + }) + + if (inherits(model, "xgb.Booster")) { + xgb.parameters(model) <- pars + } else { + for (fd in model) { + xgb.parameters(fd$bst) <- pars + } + } + return(FALSE) + }, + f_after_iter = NULL, + f_after_training = NULL + ) +} + +#' @title Callback to activate early stopping +#' @param stopping_rounds The number of rounds with no improvement in +#' the evaluation metric in order to stop the training. +#' @param maximize Whether to maximize the evaluation metric. +#' @param metric_name The name of an evaluation column to use as a criteria for early +#' stopping. If not set, the last column would be used. +#' Let's say the test data in \code{evals} was labelled as \code{dtest}, +#' and one wants to use the AUC in test data for early stopping regardless of where +#' it is in the \code{evals}, then one of the following would need to be set: +#' \code{metric_name='dtest-auc'} or \code{metric_name='dtest_auc'}. +#' All dash '-' characters in metric names are considered equivalent to '_'. +#' @param verbose Whether to print the early stopping information. +#' @param keep_all_iter Whether to keep all of the boosting rounds that were produced +#' in the resulting object. If passing `FALSE`, will only keep the boosting rounds +#' up to the detected best iteration, discarding the ones that come after. +#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}. +#' @description +#' This callback function determines the condition for early stopping. 
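A sketch of the parameter-resetting callback in use, decaying the learning rate across boosting rounds. It reuses the illustrative `dtrain` from the earlier sketch; the decay schedule itself is an arbitrary example, not taken from this patch.

nrounds <- 20
# Either a vector of length 'nrounds' ...
eta_schedule <- seq(0.3, 0.05, length.out = nrounds)
# ... or a function of (iteration, nrounds) can be supplied per parameter.
eta_fun <- function(iteration, nrounds) 0.3 * 0.95^(iteration - 1)
bst <- xgb.train(
  params = list(objective = "binary:logistic", nthread = 1),
  data = dtrain,
  nrounds = nrounds,
  evals = list(train = dtrain),
  callbacks = list(xgb.cb.reset.parameters(list(eta = eta_fun)))
)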
+#' +#' The following attributes are assigned to the booster's object: +#' \itemize{ +#' \item \code{best_score} the evaluation score at the best iteration +#' \item \code{best_iteration} at which boosting iteration the best score has occurred +#' (0-based index for interoperability of binary models) +#' } +#' +#' The same values are also stored as R attributes as a result of the callback, plus an additional +#' attribute `stopped_by_max_rounds` which indicates whether an early stopping by the `stopping_rounds` +#' condition occurred. Note that the `best_iteration` that is stored under R attributes will follow +#' base-1 indexing, so it will be larger by '1' than the C-level 'best_iteration' that is accessed +#' through \link{xgb.attr} or \link{xgb.attributes}. +#' +#' At least one dataset is required in `evals` for early stopping to work. +#' @export +xgb.cb.early.stop <- function( + stopping_rounds, + maximize = FALSE, + metric_name = NULL, + verbose = TRUE, + keep_all_iter = TRUE +) { + if (!is.null(metric_name)) { + stopifnot(is.character(metric_name)) + stopifnot(length(metric_name) == 1L) + } + + xgb.Callback( + cb_name = "early_stop", + env = as.environment( + list( + checked_evnames = FALSE, + stopping_rounds = stopping_rounds, + maximize = maximize, + metric_name = metric_name, + verbose = verbose, + keep_all_iter = keep_all_iter, + stopped_by_max_rounds = FALSE + ) + ), + f_before_training = function(env, model, data, evals, begin_iteration, end_iteration) { + if (inherits(model, "xgb.Booster") && !length(evals)) { + stop("For early stopping, 'evals' must have at least one element") + } + env$begin_iteration <- begin_iteration + return(NULL) + }, + f_before_iter = function(env, model, data, evals, iteration) NULL, + f_after_iter = function(env, model, data, evals, iteration, iter_feval) { + sds <- NULL + if (NCOL(iter_feval) > 1) { + tmp <- .summarize.feval(iter_feval, TRUE) + iter_feval <- tmp$feval + sds <- tmp$stdev + } + + if (!env$checked_evnames) { + + eval_names <- gsub('-', '_', names(iter_feval), fixed = TRUE) + if (!is.null(env$metric_name)) { + env$metric_idx <- which(gsub('-', '_', env$metric_name, fixed = TRUE) == eval_names) + if (length(env$metric_idx) == 0) + stop("'metric_name' for early stopping is not one of the following:\n", + paste(eval_names, collapse = ' '), '\n') + } + + if (is.null(env$metric_name)) { + if (NROW(iter_feval) == 1) { + env$metric_idx <- 1L + } else { + env$metric_idx <- length(eval_names) + if (env$verbose) + cat('Multiple eval metrics are present. 
Will use ', + eval_names[env$metric_idx], ' for early stopping.\n', sep = '') + } + } + + env$metric_name <- eval_names[env$metric_idx] + + # maximize is usually NULL when not set in xgb.train and built-in metrics + if (is.null(env$maximize)) + env$maximize <- grepl('(_auc|_aupr|_map|_ndcg|_pre)', env$metric_name) + + if (env$verbose) + cat("Will train until ", env$metric_name, " hasn't improved in ", + env$stopping_rounds, " rounds.\n\n", sep = '') + + env$best_iteration <- env$begin_iteration + if (env$maximize) { + env$best_score <- -Inf + } else { + env$best_score <- Inf + } + + if (inherits(model, "xgb.Booster")) { + best_score <- xgb.attr(model, 'best_score') + if (NROW(best_score)) env$best_score <- as.numeric(best_score) + best_iteration <- xgb.attr(model, 'best_iteration') + if (NROW(best_iteration)) env$best_iteration <- as.numeric(best_iteration) + 1 + } + + env$checked_evnames <- TRUE + } + + score <- iter_feval[env$metric_idx] + if ((env$maximize && score > env$best_score) || + (!env$maximize && score < env$best_score)) { + + env$best_score <- score + env$best_iteration <- iteration + # save the property to attributes, so they will occur in checkpoint + if (inherits(model, "xgb.Booster")) { + xgb.attributes(model) <- list( + best_iteration = env$best_iteration - 1, # convert to 0-based index + best_score = env$best_score + ) + } + } else if (iteration - env$best_iteration >= env$stopping_rounds) { + if (env$verbose) { + best_msg <- .format_eval_string(iteration, iter_feval, sds) + cat("Stopping. Best iteration:\n", best_msg, "\n\n", sep = '') + } + env$stopped_by_max_rounds <- TRUE + return(TRUE) + } + return(FALSE) + }, + f_after_training = function(env, model, data, evals, iteration, final_feval, prev_cb_res) { + if (inherits(model, "xgb.Booster") && !env$keep_all_iter && env$best_iteration < iteration) { + # Note: it loses the attributes after being sliced, + # so they have to be re-assigned afterwards. + prev_attr <- xgb.attributes(model) + if (NROW(prev_attr)) { + suppressWarnings({ + prev_attr <- within(prev_attr, rm("best_score", "best_iteration")) + }) + } + .Call(XGBoosterSliceAndReplace_R, xgb.get.handle(model), 0L, env$best_iteration, 1L) + if (NROW(prev_attr)) { + xgb.attributes(model) <- prev_attr + } + } + attrs_set <- list(best_iteration = env$best_iteration - 1, best_score = env$best_score) + if (inherits(model, "xgb.Booster")) { + xgb.attributes(model) <- attrs_set + } else { + for (fd in model) { + xgb.attributes(fd$bst) <- attrs_set # to use in the cv.predict callback + } + } + return( + list( + best_iteration = env$best_iteration, + best_score = env$best_score, + stopped_by_max_rounds = env$stopped_by_max_rounds + ) + ) + } + ) +} + +.save.model.w.formatted.name <- function(model, save_name, iteration) { + # Note: this throws a warning if the name doesn't have anything to format through 'sprintf' + suppressWarnings({ + save_name <- sprintf(save_name, iteration) + }) + xgb.save(model, save_name) +} + +#' @title Callback for saving a model file. +#' @param save_period Save the model to disk after every +#' \code{save_period} iterations; 0 means save the model at the end. +#' @param save_name The name or path for the saved model file. +#' It can contain a \code{\link[base]{sprintf}} formatting specifier +#' to include the integer iteration number in the file name. +#' E.g., with \code{save_name} = 'xgboost_%04d.model', +#' the file saved at iteration 50 would be named "xgboost_0050.model". 
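The iteration-dependent file name described for `save_name` is plain `sprintf()` substitution (wrapped in `suppressWarnings()` by the helper above, since `sprintf()` warns when the template contains nothing to format). A quick sketch; the output path below is illustrative.

sprintf("xgboost_%04d.model", 50)
# [1] "xgboost_0050.model"
# Save a snapshot every 10 rounds under an iteration-numbered name:
bst <- xgb.train(
  params = list(objective = "binary:logistic", nthread = 1),
  data = dtrain, nrounds = 20,
  callbacks = list(xgb.cb.save.model(
    save_period = 10,
    save_name = file.path(tempdir(), "xgboost_%04d.model")
  ))
)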
+#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train}, +#' but \bold{not} to \link{xgb.cv}. +#' @description +#' This callback function allows to save an xgb-model file, either periodically +#' after each \code{save_period}'s or at the end. +#' +#' Does not leave any attribute in the booster. +#' @export +xgb.cb.save.model <- function(save_period = 0, save_name = "xgboost.ubj") { + if (save_period < 0) { + stop("'save_period' cannot be negative") + } + if (!is.character(save_name) || length(save_name) != 1L) { + stop("'save_name' must be a single character refering to file name.") + } + + xgb.Callback( + cb_name = "save_model", + env = as.environment(list(save_period = save_period, save_name = save_name, last_save = 0)), + f_before_training = function(env, model, data, evals, begin_iteration, end_iteration) { + env$begin_iteration <- begin_iteration + }, + f_before_iter = NULL, + f_after_iter = function(env, model, data, evals, iteration, iter_feval) { + if (env$save_period > 0 && (iteration - env$begin_iteration) %% env$save_period == 0) { + .save.model.w.formatted.name(model, env$save_name, iteration) + env$last_save <- iteration + } + return(FALSE) + }, + f_after_training = function(env, model, data, evals, iteration, final_feval, prev_cb_res) { + if (env$save_period == 0 && iteration > env$last_save) { + .save.model.w.formatted.name(model, env$save_name, iteration) + } + } + ) +} + +#' @title Callback for returning cross-validation based predictions. +#' @param save_models A flag for whether to save the folds' models. +#' @param outputmargin Whether to save margin predictions (same effect as passing this +#' parameter to \link{predict.xgb.Booster}). +#' @return An `xgb.Callback` object, which can be passed to \link{xgb.cv}, +#' but \bold{not} to \link{xgb.train}. +#' @description +#' This callback function saves predictions for all of the test folds, +#' and also allows to save the folds' models. +#' @details +#' Predictions are saved inside of the \code{pred} element, which is either a vector or a matrix, +#' depending on the number of prediction outputs per data row. The order of predictions corresponds +#' to the order of rows in the original dataset. Note that when a custom \code{folds} list is +#' provided in \code{xgb.cv}, the predictions would only be returned properly when this list is a +#' non-overlapping list of k sets of indices, as in a standard k-fold CV. The predictions would not be +#' meaningful when user-provided folds have overlapping indices as in, e.g., random sampling splits. +#' When some of the indices in the training dataset are not included into user-provided \code{folds}, +#' their prediction value would be \code{NA}. 
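A sketch of collecting out-of-fold predictions with the callback documented above (the `prediction = TRUE` shortcut of `xgb.cv()` engages this same callback). The retrieval path is an assumption based on the callback's `cb_name`; data objects are the illustrative ones from the earlier sketches.

set.seed(123)
cv_res <- xgb.cv(
  params = list(objective = "binary:logistic", nthread = 1),
  data = dtrain,
  nrounds = 10,
  nfold = 5,
  callbacks = list(xgb.cb.cv.predict(save_models = FALSE))
)
# Assumed retrieval path: the callback's output is stored under its name, 'cv_predict'.
head(cv_res$cv_predict$pred)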
+#' @export +xgb.cb.cv.predict <- function(save_models = FALSE, outputmargin = FALSE) { + xgb.Callback( + cb_name = "cv_predict", + env = as.environment(list(save_models = save_models, outputmargin = outputmargin)), + f_before_training = function(env, model, data, evals, begin_iteration, end_iteration) { + if (inherits(model, "xgb.Booster")) { + stop("'cv.predict' callback is only for 'xgb.cv'.") + } + }, + f_before_iter = NULL, + f_after_iter = NULL, + f_after_training = function(env, model, data, evals, iteration, final_feval, prev_cb_res) { + pred <- NULL + for (fd in model) { + pr <- predict( + fd$bst, + fd$evals[[2L]], + outputmargin = env$outputmargin, + reshape = TRUE + ) + if (is.null(pred)) { + if (NCOL(pr) > 1L) { + pred <- matrix(NA_real_, nrow(data), ncol(pr)) + } else { + pred <- matrix(NA_real_, nrow(data)) + } + } + if (is.matrix(pred)) { + pred[fd$index, ] <- pr + } else { + pred[fd$index] <- pr + } + } + out <- list(pred = pred) + if (env$save_models) { + out$models <- lapply(model, function(fd) fd$bst) + } + return(out) + } + ) +} + +.list2mat <- function(coef_list, sparse) { + if (sparse) { + coef_mat <- methods::new("dgRMatrix") + coef_mat@p <- as.integer(c(0, cumsum(sapply(coef_list, function(x) length(x@x))))) + coef_mat@j <- as.integer(unlist(lapply(coef_list, slot, "i")) - 1L) + coef_mat@x <- unlist(lapply(coef_list, slot, "x")) + coef_mat@Dim <- as.integer(c(length(coef_list), length(coef_list[[1L]]))) + # Note: function 'xgb.gblinear.history' might later on try to slice by columns + coef_mat <- methods::as(coef_mat, "CsparseMatrix") + return(coef_mat) + } else { + return(unname(do.call(rbind, coef_list))) + } +} + +.extract.coef <- function(model, sparse) { + coefs <- .internal.coef.xgb.Booster(model, add_names = FALSE) + if (NCOL(coefs) > 1L) { + coefs <- as.vector(coefs) + } + if (sparse) { + coefs <- methods::as(coefs, "sparseVector") + } + return(coefs) +} + +#' @title Callback for collecting coefficients history of a gblinear booster +#' @param sparse when set to `FALSE`/`TRUE`, a dense/sparse matrix is used to store the result. +#' Sparse format is useful when one expects only a subset of coefficients to be non-zero, +#' when using the "thrifty" feature selector with fairly small number of top features +#' selected per iteration. +#' @return An `xgb.Callback` object, which can be passed to \link{xgb.train} or \link{xgb.cv}. +#' @details +#' To keep things fast and simple, gblinear booster does not internally store the history of linear +#' model coefficients at each boosting iteration. This callback provides a workaround for storing +#' the coefficients' path, by extracting them after each training iteration. +#' +#' This callback will construct a matrix where rows are boosting iterations and columns are +#' feature coefficients (same order as when calling \link{coef.xgb.Booster}, with the intercept +#' corresponding to the first column). +#' +#' When there is more than one coefficient per feature (e.g. multi-class classification), +#' the result will be reshaped into a vector where coefficients are arranged first by features and +#' then by class (e.g. first 1 through N coefficients will be for the first class, then +#' coefficients N+1 through 2N for the second class, and so on). 
+#' +#' If the result has only one coefficient per feature in the data, then the resulting matrix +#' will have column names matching with the feature names, otherwise (when there's more than +#' one coefficient per feature) the names will be composed as 'column name' + ':' + 'class index' +#' (so e.g. column 'c1' for class '0' will be named 'c1:0'). +#' +#' With \code{xgb.train}, the output is either a dense or a sparse matrix. +#' With with \code{xgb.cv}, it is a list (one element per each fold) of such +#' matrices. +#' +#' Function \link{xgb.gblinear.history} function provides an easy way to retrieve the +#' outputs from this callback. +#' @seealso \link{xgb.gblinear.history}, \link{coef.xgb.Booster}. +#' @examples +#' #### Binary classification: +#' +#' ## Keep the number of threads to 1 for examples +#' nthread <- 1 +#' data.table::setDTthreads(nthread) +#' +#' # In the iris dataset, it is hard to linearly separate Versicolor class from the rest +#' # without considering the 2nd order interactions: +#' x <- model.matrix(Species ~ .^2, iris)[,-1] +#' colnames(x) +#' dtrain <- xgb.DMatrix(scale(x), label = 1*(iris$Species == "versicolor"), nthread = nthread) +#' param <- list(booster = "gblinear", objective = "reg:logistic", eval_metric = "auc", +#' lambda = 0.0003, alpha = 0.0003, nthread = nthread) +#' # For 'shotgun', which is a default linear updater, using high eta values may result in +#' # unstable behaviour in some datasets. With this simple dataset, however, the high learning +#' # rate does not break the convergence, but allows us to illustrate the typical pattern of +#' # "stochastic explosion" behaviour of this lock-free algorithm at early boosting iterations. +#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 1., +#' callbacks = list(xgb.cb.gblinear.history())) +#' # Extract the coefficients' path and plot them vs boosting iteration number: +#' coef_path <- xgb.gblinear.history(bst) +#' matplot(coef_path, type = 'l') +#' +#' # With the deterministic coordinate descent updater, it is safer to use higher learning rates. +#' # Will try the classical componentwise boosting which selects a single best feature per round: +#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 0.8, +#' updater = 'coord_descent', feature_selector = 'thrifty', top_k = 1, +#' callbacks = list(xgb.cb.gblinear.history())) +#' matplot(xgb.gblinear.history(bst), type = 'l') +#' # Componentwise boosting is known to have similar effect to Lasso regularization. +#' # Try experimenting with various values of top_k, eta, nrounds, +#' # as well as different feature_selectors. 
+#' +#' # For xgb.cv: +#' bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 100, eta = 0.8, +#' callbacks = list(xgb.cb.gblinear.history())) +#' # coefficients in the CV fold #3 +#' matplot(xgb.gblinear.history(bst)[[3]], type = 'l') +#' +#' +#' #### Multiclass classification: +#' # +#' dtrain <- xgb.DMatrix(scale(x), label = as.numeric(iris$Species) - 1, nthread = nthread) +#' param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3, +#' lambda = 0.0003, alpha = 0.0003, nthread = nthread) +#' # For the default linear updater 'shotgun' it sometimes is helpful +#' # to use smaller eta to reduce instability +#' bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 50, eta = 0.5, +#' callbacks = list(xgb.cb.gblinear.history())) +#' # Will plot the coefficient paths separately for each class: +#' matplot(xgb.gblinear.history(bst, class_index = 0), type = 'l') +#' matplot(xgb.gblinear.history(bst, class_index = 1), type = 'l') +#' matplot(xgb.gblinear.history(bst, class_index = 2), type = 'l') +#' +#' # CV: +#' bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 70, eta = 0.5, +#' callbacks = list(xgb.cb.gblinear.history(FALSE))) +#' # 1st fold of 1st class +#' matplot(xgb.gblinear.history(bst, class_index = 0)[[1]], type = 'l') +#' +#' @export +xgb.cb.gblinear.history <- function(sparse = FALSE) { + xgb.Callback( + cb_name = "gblinear_history", + env = as.environment(list(sparse = sparse)), + f_before_training = function(env, model, data, evals, begin_iteration, end_iteration) { + if (!inherits(model, "xgb.Booster")) { + model <- model[[1L]]$bst + } + if (xgb.booster_type(model) != "gblinear") { + stop("Callback 'xgb.cb.gblinear.history' is only for booster='gblinear'.") + } + env$coef_hist <- vector("list", end_iteration - begin_iteration + 1) + env$next_idx <- 1 + }, + f_before_iter = NULL, + f_after_iter = function(env, model, data, evals, iteration, iter_feval) { + if (inherits(model, "xgb.Booster")) { + coef_this <- .extract.coef(model, env$sparse) + } else { + coef_this <- lapply(model, function(fd) .extract.coef(fd$bst, env$sparse)) + } + env$coef_hist[[env$next_idx]] <- coef_this + env$next_idx <- env$next_idx + 1 + return(FALSE) + }, + f_after_training = function(env, model, data, evals, iteration, final_feval, prev_cb_res) { + # in case of early stopping + if (env$next_idx <= length(env$coef_hist)) { + env$coef_hist <- head(env$coef_hist, env$next_idx - 1) + } + + is_booster <- inherits(model, "xgb.Booster") + if (is_booster) { + out <- .list2mat(env$coef_hist, env$sparse) + } else { + out <- lapply( + X = lapply( + X = seq_along(env$coef_hist[[1]]), + FUN = function(i) lapply(env$coef_hist, "[[", i) + ), + FUN = .list2mat, + env$sparse + ) + } + if (!is.null(prev_cb_res)) { + if (is_booster) { + out <- rbind(prev_cb_res, out) + } else { + # Note: this case should never be encountered, since training cannot + # be continued from the result of xgb.cv, but this code should in + # theory do the job if the situation were to be encountered. 
+ out <- lapply( + out, + function(lst) { + lapply( + seq_along(lst), + function(i) rbind(prev_cb_res[[i]], lst[[i]]) + ) + } + ) + } + } + feature_names <- getinfo(data, "feature_name") + if (!NROW(feature_names)) { + feature_names <- paste0("V", seq(1L, ncol(data))) + } + expected_ncols <- length(feature_names) + 1 + if (is_booster) { + mat_ncols <- ncol(out) + } else { + mat_ncols <- ncol(out[[1L]]) + } + if (mat_ncols %% expected_ncols == 0) { + feature_names <- c("(Intercept)", feature_names) + n_rep <- mat_ncols / expected_ncols + if (n_rep > 1) { + feature_names <- unlist( + lapply( + seq(1, n_rep), + function(cl) paste(feature_names, cl - 1, sep = ":") + ) + ) + } + if (is_booster) { + colnames(out) <- feature_names + } else { + out <- lapply( + out, + function(mat) { + colnames(mat) <- feature_names + return(mat) + } + ) + } + } + return(out) + } + ) +} + +#' @title Extract gblinear coefficients history. +#' @description A helper function to extract the matrix of linear coefficients' history +#' from a gblinear model created while using the \link{xgb.cb.gblinear.history} +#' callback (which must be added manually as by default it's not used). +#' @details Note that this is an R-specific function that relies on R attributes that +#' are not saved when using xgboost's own serialization functions like \link{xgb.load} +#' or \link{xgb.load.raw}. +#' +#' In order for a serialized model to be accepted by this function, one must use R +#' serializers such as \link{saveRDS}. +#' @param model either an \code{xgb.Booster} or a result of \code{xgb.cv()}, trained +#' using the \link{xgb.cb.gblinear.history} callback, but \bold{not} a booster +#' loaded from \link{xgb.load} or \link{xgb.load.raw}. +#' @param class_index zero-based class index to extract the coefficients for only that +#' specific class in a multinomial multiclass model. When it is NULL, all the +#' coefficients are returned. Has no effect in non-multiclass models. +#' +#' @return +#' For an \link{xgb.train} result, a matrix (either dense or sparse) with the columns +#' corresponding to iteration's coefficients and the rows corresponding to boosting iterations. +#' +#' For an \link{xgb.cv} result, a list of such matrices is returned with the elements +#' corresponding to CV folds. +#' +#' When there is more than one coefficient per feature (e.g. multi-class classification) +#' and `class_index` is not provided, +#' the result will be reshaped into a vector where coefficients are arranged first by features and +#' then by class (e.g. first 1 through N coefficients will be for the first class, then +#' coefficients N+1 through 2N for the second class, and so on). +#' @seealso \link{xgb.cb.gblinear.history}, \link{coef.xgb.Booster}. 
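Because the coefficient history is kept in R attributes (as noted above), it survives an R-level `saveRDS()`/`readRDS()` round trip but not `xgb.save()`/`xgb.load()`. A sketch, assuming `bst` is a gblinear booster trained with `xgb.cb.gblinear.history()` as in the examples above:

fname <- file.path(tempdir(), "gblinear_with_history.rds")
saveRDS(bst, fname)
bst2 <- readRDS(fname)
coef_path <- xgb.gblinear.history(bst2)
dim(coef_path)  # rows: boosting iterations; columns: intercept followed by feature coefficients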
+#' @export +xgb.gblinear.history <- function(model, class_index = NULL) { + + if (!(inherits(model, "xgb.Booster") || + inherits(model, "xgb.cv.synchronous"))) + stop("model must be an object of either xgb.Booster or xgb.cv.synchronous class") + is_cv <- inherits(model, "xgb.cv.synchronous") + + if (!is_cv) { + coef_path <- getElement(attributes(model), "gblinear_history") + } else { + coef_path <- getElement(model, "gblinear_history") + } + if (is.null(coef_path)) { + stop("model must be trained while using the xgb.cb.gblinear.history() callback") + } + + if (!is_cv) { + num_class <- xgb.num_class(model) + num_feat <- xgb.num_feature(model) + } else { + # in case of CV, the object is expected to have this info + if (model$params$booster != "gblinear") + stop("It does not appear to be a gblinear model") + num_class <- NVL(model$params$num_class, 1) + num_feat <- model$nfeatures + if (is.null(num_feat)) + stop("This xgb.cv result does not have nfeatures info") + } + + if (!is.null(class_index) && + num_class > 1 && + (class_index[1] < 0 || class_index[1] >= num_class)) + stop("class_index has to be within [0,", num_class - 1, "]") + + if (!is.null(class_index) && num_class > 1) { + seq_take <- seq(1 + class_index * (num_feat + 1), (class_index + 1) * (num_feat + 1)) + coef_path <- if (is.list(coef_path)) { + lapply(coef_path, function(x) x[, seq_take]) + } else { + coef_path <- coef_path[, seq_take] + } + } + return(coef_path) +} + +.callbacks.only.train <- "save_model" +.callbacks.only.cv <- "cv_predict" + +.process.callbacks <- function(callbacks, is_cv) { + if (inherits(callbacks, "xgb.Callback")) { + callbacks <- list(callbacks) + } + if (!is.list(callbacks)) { + stop("'callbacks' must be a list.") + } + cb_names <- character() + if (length(callbacks)) { + is_callback <- sapply(callbacks, inherits, "xgb.Callback") + if (!all(is_callback)) { + stop("Entries in 'callbacks' must be 'xgb.Callback' objects.") + } + cb_names <- sapply(callbacks, function(cb) cb$cb_name) + if (length(cb_names) != length(callbacks)) { + stop("Passed invalid callback(s).") + } + if (anyDuplicated(cb_names) > 0) { + stop("Callbacks must have unique names.") + } + if (is_cv) { + if (any(.callbacks.only.train %in% cb_names)) { + stop( + "Passed callback(s) not supported for 'xgb.cv': ", + paste(intersect(.callbacks.only.train, cb_names), collapse = ", ") + ) + } + } else { + if (any(.callbacks.only.cv %in% cb_names)) { + stop( + "Passed callback(s) not supported for 'xgb.train': ", + paste(intersect(.callbacks.only.cv, cb_names), collapse = ", ") + ) + } + } + # Early stopping callback needs to be executed before the others + if ("early_stop" %in% cb_names) { + mask <- cb_names == "early_stop" + callbacks <- c(list(callbacks[[which(mask)]]), callbacks[!mask]) + } + } + return(list(callbacks = callbacks, cb_names = cb_names)) +} + +# Note: don't try to use functions like 'append', as they will +# merge the elements of the different callbacks into a single list. 
+add.callback <- function(callbacks, cb, as_first_elt = FALSE) { + if (!as_first_elt) { + callbacks[[length(callbacks) + 1]] <- cb + return(callbacks) + } else { + if (!length(callbacks)) { + return(list(cb)) + } + new_cb <- vector("list", length(callbacks) + 1) + new_cb[[1]] <- cb + new_cb[seq(2, length(new_cb))] <- callbacks + return(new_cb) + } +} + +has.callbacks <- function(callbacks, cb_name) { + cb_names <- sapply(callbacks, function(cb) cb$name) + return(cb_name %in% cb_names) } diff --git a/R-package/R/utils.R b/R-package/R/utils.R index e8ae787fc..7b6a20f70 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -26,6 +26,11 @@ NVL <- function(x, val) { 'multi:softprob', 'rank:pairwise', 'rank:ndcg', 'rank:map')) } +.RANKING_OBJECTIVES <- function() { + return(c('binary:logistic', 'binary:logitraw', 'binary:hinge', 'multi:softmax', + 'multi:softprob')) +} + # # Low-level functions for boosting -------------------------------------------- @@ -142,7 +147,7 @@ check.custom.eval <- function(env = parent.frame()) { if (!is.null(env$feval) && is.null(env$maximize) && ( !is.null(env$early_stopping_rounds) || - has.callbacks(env$callbacks, 'cb.early.stop'))) + has.callbacks(env$callbacks, "early_stop"))) stop("Please set 'maximize' to indicate whether the evaluation metric needs to be maximized or not") } @@ -193,20 +198,20 @@ xgb.iter.update <- function(bst, dtrain, iter, obj) { # Evaluate one iteration. # Returns a named vector of evaluation metrics # with the names in a 'datasetname-metricname' format. -xgb.iter.eval <- function(bst, watchlist, iter, feval) { +xgb.iter.eval <- function(bst, evals, iter, feval) { handle <- xgb.get.handle(bst) - if (length(watchlist) == 0) + if (length(evals) == 0) return(NULL) - evnames <- names(watchlist) + evnames <- names(evals) if (is.null(feval)) { - msg <- .Call(XGBoosterEvalOneIter_R, handle, as.integer(iter), watchlist, as.list(evnames)) + msg <- .Call(XGBoosterEvalOneIter_R, handle, as.integer(iter), evals, as.list(evnames)) mat <- matrix(strsplit(msg, '\\s+|:')[[1]][-1], nrow = 2) res <- structure(as.numeric(mat[2, ]), names = mat[1, ]) } else { - res <- sapply(seq_along(watchlist), function(j) { - w <- watchlist[[j]] + res <- sapply(seq_along(evals), function(j) { + w <- evals[[j]] ## predict using all trees preds <- predict(bst, w, outputmargin = TRUE, iterationrange = "all") eval_res <- feval(preds, w) @@ -235,33 +240,43 @@ convert.labels <- function(labels, objective_name) { } # Generates random (stratified if needed) CV folds -generate.cv.folds <- function(nfold, nrows, stratified, label, params) { +generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) { + if (NROW(group)) { + if (stratified) { + warning( + paste0( + "Stratified splitting is not supported when using 'group' attribute.", + " Will use unstratified splitting." 
+ ) + ) + } + return(generate.group.folds(nfold, group)) + } + objective <- params$objective + if (!is.character(objective)) { + warning("Will use unstratified splitting (custom objective used)") + stratified <- FALSE + } + # cannot stratify if label is NULL + if (stratified && is.null(label)) { + warning("Will use unstratified splitting (no 'labels' available)") + stratified <- FALSE + } # cannot do it for rank - objective <- params$objective if (is.character(objective) && strtrim(objective, 5) == 'rank:') { - stop("\n\tAutomatic generation of CV-folds is not implemented for ranking!\n", + stop("\n\tAutomatic generation of CV-folds is not implemented for ranking without 'group' field!\n", "\tConsider providing pre-computed CV-folds through the 'folds=' parameter.\n") } # shuffle rnd_idx <- sample.int(nrows) - if (stratified && - length(label) == length(rnd_idx)) { + if (stratified && length(label) == length(rnd_idx)) { y <- label[rnd_idx] - # WARNING: some heuristic logic is employed to identify classification setting! # - For classification, need to convert y labels to factor before making the folds, # and then do stratification by factor levels. # - For regression, leave y numeric and do stratification by quantiles. if (is.character(objective)) { - y <- convert.labels(y, params$objective) - } else { - # If no 'objective' given in params, it means that user either wants to - # use the default 'reg:squarederror' objective or has provided a custom - # obj function. Here, assume classification setting when y has 5 or less - # unique values: - if (length(unique(y)) <= 5) { - y <- factor(y) - } + y <- convert.labels(y, objective) } folds <- xgb.createFolds(y = y, k = nfold) } else { @@ -277,6 +292,29 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, params) { return(folds) } +generate.group.folds <- function(nfold, group) { + ngroups <- length(group) - 1 + if (ngroups < nfold) { + stop("DMatrix has fewer groups than folds.") + } + seq_groups <- seq_len(ngroups) + indices <- lapply(seq_groups, function(gr) seq(group[gr] + 1, group[gr + 1])) + assignments <- base::split(seq_groups, as.integer(seq_groups %% nfold)) + assignments <- unname(assignments) + + out <- vector("list", nfold) + randomized_groups <- sample(ngroups) + for (idx in seq_len(nfold)) { + groups_idx_test <- randomized_groups[assignments[[idx]]] + groups_test <- indices[groups_idx_test] + idx_test <- unlist(groups_test) + attributes(idx_test)$group_test <- lengths(groups_test) + attributes(idx_test)$group_train <- lengths(indices[-groups_idx_test]) + out[[idx]] <- idx_test + } + return(out) +} + # Creates CV folds stratified by the values of y. # It was borrowed from caret::createFolds and simplified # by always returning an unnamed list of fold indices. @@ -454,7 +492,8 @@ depr_par_lut <- matrix(c( 'plot.height', 'plot_height', 'plot.width', 'plot_width', 'n_first_tree', 'trees', - 'dummy', 'DUMMY' + 'dummy', 'DUMMY', + 'watchlist', 'evals' ), ncol = 2, byrow = TRUE) colnames(depr_par_lut) <- c('old', 'new') diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index febefb757..77d75fa9c 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -77,26 +77,45 @@ xgb.get.handle <- function(object) { #' Predict method for XGBoost model #' -#' Predicted values based on either xgboost model or model handle object. +#' Predict values on data based on xgboost model. #' #' @param object Object of class `xgb.Booster`. 
-#' @param newdata Takes `matrix`, `dgCMatrix`, `dgRMatrix`, `dsparseVector`, +#' @param newdata Takes `data.frame`, `matrix`, `dgCMatrix`, `dgRMatrix`, `dsparseVector`, #' local data file, or `xgb.DMatrix`. -#' For single-row predictions on sparse data, it is recommended to use the CSR format. -#' If passing a sparse vector, it will take it as a row vector. -#' @param missing Only used when input is a dense matrix. Pick a float value that represents -#' missing values in data (e.g., 0 or some other extreme value). +#' +#' For single-row predictions on sparse data, it's recommended to use CSR format. If passing +#' a sparse vector, it will take it as a row vector. +#' +#' Note that, for repeated predictions on the same data, one might want to create a DMatrix to +#' pass here instead of passing R types like matrices or data frames, as predictions will be +#' faster on DMatrix. +#' +#' If `newdata` is a `data.frame`, be aware that:\itemize{ +#' \item Columns will be converted to numeric if they aren't already, which could potentially make +#' the operation slower than in an equivalent `matrix` object. +#' \item The order of the columns must match with that of the data from which the model was fitted +#' (i.e. columns will not be referenced by their names, just by their order in the data). +#' \item If the model was fitted to data with categorical columns, these columns must be of +#' `factor` type here, and must use the same encoding (i.e. have the same levels). +#' \item If `newdata` contains any `factor` columns, they will be converted to base-0 +#' encoding (same as during DMatrix creation) - hence, one should not pass a `factor` +#' under a column which during training had a different type. +#' } +#' @param missing Float value that represents missing values in data (e.g., 0 or some other extreme value). +#' +#' This parameter is not used when `newdata` is an `xgb.DMatrix` - in such cases, should pass +#' this as an argument to the DMatrix constructor instead. #' @param outputmargin Whether the prediction should be returned in the form of original untransformed #' sum of predictions from boosting iterations' results. E.g., setting `outputmargin=TRUE` for #' logistic regression would return log-odds instead of probabilities. -#' @param predleaf Whether to predict pre-tree leaf indices. +#' @param predleaf Whether to predict per-tree leaf indices. #' @param predcontrib Whether to return feature contributions to individual predictions (see Details). #' @param approxcontrib Whether to use a fast approximation for feature contributions (see Details). #' @param predinteraction Whether to return contributions of feature interactions to individual predictions (see Details). #' @param reshape Whether to reshape the vector of predictions to matrix form when there are several #' prediction outputs per case. No effect if `predleaf`, `predcontrib`, #' or `predinteraction` is `TRUE`. -#' @param training Whether the predictions are used for training. For dart booster, +#' @param training Whether the prediction result is used for training. For dart booster, #' training predicting will perform dropout. #' @param iterationrange Sequence of rounds/iterations from the model to use for prediction, specified by passing #' a two-dimensional vector with the start and end numbers in the sequence (same format as R's `seq` - i.e. @@ -111,6 +130,12 @@ xgb.get.handle <- function(object) { #' If passing "all", will use all of the rounds regardless of whether the model had early stopping or not. 
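A sketch of the new `data.frame` input path for `predict.xgb.Booster()`, using an all-numeric data frame for simplicity; per the documentation above, column order must match the training data and any factor columns must use the training-time encoding. Object names are illustrative.

df <- iris[, 1:4]
dm <- xgb.DMatrix(as.matrix(df), label = as.numeric(iris$Species == "versicolor"), nthread = 1)
bst <- xgb.train(
  params = list(objective = "binary:logistic", nthread = 1),
  data = dm, nrounds = 5
)
# Predict directly from the data.frame (no explicit DMatrix construction needed):
p_df  <- predict(bst, df)
p_mat <- predict(bst, as.matrix(df))
all.equal(p_df, p_mat)  # expected to agree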
#' @param strict_shape Default is `FALSE`. When set to `TRUE`, the output #' type and shape of predictions are invariant to the model type. +#' @param base_margin Base margin used for boosting from existing model. +#' +#' Note that, if `newdata` is an `xgb.DMatrix` object, this argument will +#' be ignored as it needs to be added to the DMatrix instead (e.g. by passing it as +#' an argument in its constructor, or by calling \link{setinfo.xgb.DMatrix}). +#' #' @param validate_features When `TRUE`, validate that the Booster's and newdata's feature_names #' match (only applicable when both `object` and `newdata` have feature names). #' @@ -287,16 +312,80 @@ xgb.get.handle <- function(object) { predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FALSE, predleaf = FALSE, predcontrib = FALSE, approxcontrib = FALSE, predinteraction = FALSE, reshape = FALSE, training = FALSE, iterationrange = NULL, strict_shape = FALSE, - validate_features = FALSE, ...) { + validate_features = FALSE, base_margin = NULL, ...) { if (validate_features) { newdata <- validate.features(object, newdata) } - if (!inherits(newdata, "xgb.DMatrix")) { + is_dmatrix <- inherits(newdata, "xgb.DMatrix") + if (is_dmatrix && !is.null(base_margin)) { + stop( + "'base_margin' is not supported when passing 'xgb.DMatrix' as input.", + " Should be passed as argument to 'xgb.DMatrix' constructor." + ) + } + + use_as_df <- FALSE + use_as_dense_matrix <- FALSE + use_as_csr_matrix <- FALSE + n_row <- NULL + if (!is_dmatrix) { + + inplace_predict_supported <- !predcontrib && !predinteraction && !predleaf + if (inplace_predict_supported) { + booster_type <- xgb.booster_type(object) + if (booster_type == "gblinear" || (booster_type == "dart" && training)) { + inplace_predict_supported <- FALSE + } + } + if (inplace_predict_supported) { + + if (is.matrix(newdata)) { + use_as_dense_matrix <- TRUE + } else if (is.data.frame(newdata)) { + # note: since here it turns it into a non-data-frame list, + # needs to keep track of the number of rows it had for later + n_row <- nrow(newdata) + newdata <- lapply( + newdata, + function(x) { + if (is.factor(x)) { + return(as.numeric(x) - 1) + } else { + return(as.numeric(x)) + } + } + ) + use_as_df <- TRUE + } else if (inherits(newdata, "dgRMatrix")) { + use_as_csr_matrix <- TRUE + csr_data <- list(newdata@p, newdata@j, newdata@x, ncol(newdata)) + } else if (inherits(newdata, "dsparseVector")) { + use_as_csr_matrix <- TRUE + n_row <- 1L + i <- newdata@i - 1L + if (storage.mode(i) != "integer") { + storage.mode(i) <- "integer" + } + csr_data <- list(c(0L, length(i)), i, newdata@x, length(newdata)) + } + + } + + } # if (!is_dmatrix) + + if (!is_dmatrix && !use_as_dense_matrix && !use_as_csr_matrix && !use_as_df) { nthread <- xgb.nthread(object) newdata <- xgb.DMatrix( newdata, - missing = missing, nthread = NVL(nthread, -1) + missing = missing, + base_margin = base_margin, + nthread = NVL(nthread, -1) ) + is_dmatrix <- TRUE + } + + if (is.null(n_row)) { + n_row <- nrow(newdata) } @@ -354,18 +443,30 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA args$type <- set_type(6) } - predts <- .Call( - XGBoosterPredictFromDMatrix_R, - xgb.get.handle(object), - newdata, - jsonlite::toJSON(args, auto_unbox = TRUE) - ) + json_conf <- jsonlite::toJSON(args, auto_unbox = TRUE) + if (is_dmatrix) { + predts <- .Call( + XGBoosterPredictFromDMatrix_R, xgb.get.handle(object), newdata, json_conf + ) + } else if (use_as_dense_matrix) { + predts <- .Call( + 
XGBoosterPredictFromDense_R, xgb.get.handle(object), newdata, missing, json_conf, base_margin + ) + } else if (use_as_csr_matrix) { + predts <- .Call( + XGBoosterPredictFromCSR_R, xgb.get.handle(object), csr_data, missing, json_conf, base_margin + ) + } else if (use_as_df) { + predts <- .Call( + XGBoosterPredictFromColumnar_R, xgb.get.handle(object), newdata, missing, json_conf, base_margin + ) + } + names(predts) <- c("shape", "results") shape <- predts$shape arr <- predts$results n_ret <- length(arr) - n_row <- nrow(newdata) if (n_row != shape[1]) { stop("Incorrect predict shape.") } @@ -970,6 +1071,10 @@ xgb.best_iteration <- function(bst) { #' coef(model) #' @export coef.xgb.Booster <- function(object, ...) { + return(.internal.coef.xgb.Booster(object, add_names = TRUE)) +} + +.internal.coef.xgb.Booster <- function(object, add_names = TRUE) { booster_type <- xgb.booster_type(object) if (booster_type != "gblinear") { stop("Coefficients are not defined for Booster type ", booster_type) @@ -988,21 +1093,27 @@ coef.xgb.Booster <- function(object, ...) { intercepts <- weights[seq(sep + 1, length(weights))] intercepts <- intercepts + as.numeric(base_score) - feature_names <- xgb.feature_names(object) - if (!NROW(feature_names)) { - # This mimics the default naming in R which names columns as "V1..N" - # when names are needed but not available - feature_names <- paste0("V", seq(1L, num_feature)) + if (add_names) { + feature_names <- xgb.feature_names(object) + if (!NROW(feature_names)) { + # This mimics the default naming in R which names columns as "V1..N" + # when names are needed but not available + feature_names <- paste0("V", seq(1L, num_feature)) + } + feature_names <- c("(Intercept)", feature_names) } - feature_names <- c("(Intercept)", feature_names) if (n_cols == 1L) { out <- c(intercepts, coefs) - names(out) <- feature_names + if (add_names) { + names(out) <- feature_names + } } else { coefs <- matrix(coefs, nrow = num_feature, byrow = TRUE) dim(intercepts) <- c(1L, n_cols) out <- rbind(intercepts, coefs) - row.names(out) <- feature_names + if (add_names) { + row.names(out) <- feature_names + } # TODO: if a class names attributes is added, # should use those names here. } @@ -1154,12 +1265,9 @@ print.xgb.Booster <- function(x, ...) { cat(" ", paste(attr_names, collapse = ", "), "\n") } - if (!is.null(R_attrs$callbacks) && length(R_attrs$callbacks) > 0) { - cat('callbacks:\n') - lapply(callback.calls(R_attrs$callbacks), function(x) { - cat(' ') - print(x) - }) + additional_attr <- setdiff(names(R_attrs), .reserved_cb_names) + if (NROW(additional_attr)) { + cat("callbacks:\n ", paste(additional_attr, collapse = ", "), "\n") } if (!is.null(R_attrs$evaluation_log)) { diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index ba0686cf9..15f6faed0 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -28,10 +28,27 @@ #' 'xgb.QuantileDMatrix'. #' \item Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted #' as a single row (only when making predictions from a fitted model). -#' \item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not} -#' supported for xgb.QuantileDMatrix'. -#' \item Binary files generated by \link{xgb.DMatrix.save}, passed as a path to the file. These are -#' \bold{not} supported for xgb.QuantileDMatrix'. +#' \item Text files in a supported format, passed as a `character` variable containing the URI path to +#' the file, with an optional format specifier. 
+#' +#' These are \bold{not} supported for `xgb.QuantileDMatrix`. Supported formats are:\itemize{ +#' \item XGBoost's own binary format for DMatrices, as produced by \link{xgb.DMatrix.save}. +#' \item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix +#' `?format=libsvm` at the end of the file path. It will be the default format if not +#' otherwise specified. +#' \item CSV files (comma-separated values). This format can be specified by adding suffix +#' `?format=csv` at the end ofthe file path. It will \bold{not} be auto-deduced from file extensions. +#' } +#' +#' Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv', +#' it will not look at the extension or file contents to determine that it is a comma-separated value. +#' Instead, the format must be specified following the URI format, so the input to `data` should be passed +#' like this: `"file.csv?format=csv"` (or `"file.csv?format=csv&label_column=0"` if the first column +#' corresponds to the labels). +#' +#' For more information about passing text files as input, see the articles +#' \href{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}{Text Input Format of DMatrix} and +#' \href{https://xgboost.readthedocs.io/en/stable/python/python_intro.html#python-data-interface}{Data Interface}. #' } #' @param label Label of the training data. For classification problems, should be passed encoded as #' integers with numeration starting at zero. @@ -81,6 +98,13 @@ #' @param label_lower_bound Lower bound for survival training. #' @param label_upper_bound Upper bound for survival training. #' @param feature_weights Set feature weights for column sampling. +#' @param data_split_mode When passing a URI (as R `character`) as input, this signals +#' whether to split by row or column. Allowed values are `"row"` and `"col"`. +#' +#' In distributed mode, the file is split accordingly; otherwise this is only an indicator on +#' how the file was split beforehand. Default to row. +#' +#' This is not used when `data` is not a URI. #' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional #' subclass 'xgb.QuantileDMatrix'. #' @@ -117,7 +141,8 @@ xgb.DMatrix <- function( qid = NULL, label_lower_bound = NULL, label_upper_bound = NULL, - feature_weights = NULL + feature_weights = NULL, + data_split_mode = "row" ) { if (!is.null(group) && !is.null(qid)) { stop("Either one of 'group' or 'qid' should be NULL") @@ -131,7 +156,14 @@ xgb.DMatrix <- function( ) } data <- path.expand(data) - handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent)) + if (data_split_mode == "row") { + data_split_mode <- 0L + } else if (data_split_mode == "col") { + data_split_mode <- 1L + } else { + stop("Passed invalid 'data_split_mode': ", data_split_mode) + } + handle <- .Call(XGDMatrixCreateFromURI_R, data, as.integer(silent), data_split_mode) } else if (is.matrix(data)) { handle <- .Call( XGDMatrixCreateFromMat_R, data, missing, nthread @@ -1227,8 +1259,11 @@ xgb.get.DMatrix.data <- function(dmat) { #' Get a new DMatrix containing the specified rows of #' original xgb.DMatrix object #' -#' @param object Object of class "xgb.DMatrix" -#' @param idxset a integer vector of indices of rows needed +#' @param object Object of class "xgb.DMatrix". +#' @param idxset An integer vector of indices of rows needed (base-1 indexing). 
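A sketch of the URI-based text input described above: the format is never deduced from the file extension, so it has to be passed through the `?format=` suffix (here with `label_column` marking the label column). File contents and paths are illustrative.

csv_path <- file.path(tempdir(), "iris_binary.csv")
write.table(
  cbind(as.integer(iris$Species == "versicolor"), iris[, 1:4]),
  csv_path, sep = ",", row.names = FALSE, col.names = FALSE
)
dtrain_csv <- xgb.DMatrix(paste0(csv_path, "?format=csv&label_column=0"))
dim(dtrain_csv)  # 150 x 4 - the first column was consumed as the label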
+#' @param allow_groups Whether to allow slicing an `xgb.DMatrix` with `group` (or +#' equivalently `qid`) field. Note that in such case, the result will not have +#' the groups anymore - they need to be set manually through `setinfo`. #' @param colset currently not used (columns subsetting is not available) #' #' @examples @@ -1243,11 +1278,11 @@ xgb.get.DMatrix.data <- function(dmat) { #' #' @rdname xgb.slice.DMatrix #' @export -xgb.slice.DMatrix <- function(object, idxset) { +xgb.slice.DMatrix <- function(object, idxset, allow_groups = FALSE) { if (!inherits(object, "xgb.DMatrix")) { stop("object must be xgb.DMatrix") } - ret <- .Call(XGDMatrixSliceDMatrix_R, object, idxset) + ret <- .Call(XGDMatrixSliceDMatrix_R, object, idxset, allow_groups) attr_list <- attributes(object) nr <- nrow(object) @@ -1264,7 +1299,15 @@ xgb.slice.DMatrix <- function(object, idxset) { } } } - return(structure(ret, class = "xgb.DMatrix")) + + out <- structure(ret, class = "xgb.DMatrix") + parent_fields <- as.list(attributes(object)$fields) + if (NROW(parent_fields)) { + child_fields <- parent_fields[!(names(parent_fields) %in% c("group", "qid"))] + child_fields <- as.environment(child_fields) + attributes(out)$fields <- child_fields + } + return(out) } #' @rdname xgb.slice.DMatrix @@ -1308,11 +1351,11 @@ print.xgb.DMatrix <- function(x, verbose = FALSE, ...) { } cat(class_print, ' dim:', nrow(x), 'x', ncol(x), ' info: ') - infos <- character(0) - if (xgb.DMatrix.hasinfo(x, 'label')) infos <- 'label' - if (xgb.DMatrix.hasinfo(x, 'weight')) infos <- c(infos, 'weight') - if (xgb.DMatrix.hasinfo(x, 'base_margin')) infos <- c(infos, 'base_margin') - if (length(infos) == 0) infos <- 'NA' + infos <- names(attributes(x)$fields) + infos <- infos[infos != "feature_name"] + if (!NROW(infos)) infos <- "NA" + infos <- infos[order(infos)] + infos <- paste(infos, collapse = ", ") cat(infos) cnames <- colnames(x) cat(' colnames:') diff --git a/R-package/R/xgb.DMatrix.save.R b/R-package/R/xgb.DMatrix.save.R index ef4599d0e..243f43047 100644 --- a/R-package/R/xgb.DMatrix.save.R +++ b/R-package/R/xgb.DMatrix.save.R @@ -6,6 +6,7 @@ #' @param fname the name of the file to write. #' #' @examples +#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)} #' data(agaricus.train, package='xgboost') #' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2)) #' fname <- file.path(tempdir(), "xgb.DMatrix.data") diff --git a/R-package/R/xgb.config.R b/R-package/R/xgb.config.R index 3f3a9b1a7..20b8aef90 100644 --- a/R-package/R/xgb.config.R +++ b/R-package/R/xgb.config.R @@ -4,7 +4,14 @@ #' values of one or more global-scope parameters. Use \code{xgb.get.config} to fetch the current #' values of all global-scope parameters (listed in #' \url{https://xgboost.readthedocs.io/en/stable/parameter.html}). +#' @details +#' Note that serialization-related functions might use a globally-configured number of threads, +#' which is managed by the system's OpenMP (OMP) configuration instead. Typically, XGBoost methods +#' accept an `nthreads` parameter, but some methods like `readRDS` might get executed before such +#' parameter can be supplied. #' +#' The number of OMP threads can in turn be configured for example through an environment variable +#' `OMP_NUM_THREADS` (needs to be set before R is started), or through `RhpcBLASctl::omp_set_num_threads`. 
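A sketch of the thread-control note above: set the OMP thread count before deserializing, since `readRDS()` runs before any `nthread` argument can be supplied (assumes `bst` is any fitted booster; the file path is illustrative).

RhpcBLASctl::omp_set_num_threads(1)
fname <- file.path(tempdir(), "model.rds")
saveRDS(bst, fname)
bst_restored <- readRDS(fname)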
#' @rdname xgbConfig #' @title Set and get global configuration #' @name xgb.set.config, xgb.get.config diff --git a/R-package/R/xgb.create.features.R b/R-package/R/xgb.create.features.R index baef3bb03..27f8a0975 100644 --- a/R-package/R/xgb.create.features.R +++ b/R-package/R/xgb.create.features.R @@ -71,7 +71,6 @@ #' new.dtest <- xgb.DMatrix( #' data = new.features.test, label = agaricus.test$label, nthread = 2 #' ) -#' watchlist <- list(train = new.dtrain) #' bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds, nthread = 2) #' #' # Model accuracy with new features diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index 29bddb57f..880fd5697 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -1,6 +1,6 @@ #' Cross Validation #' -#' The cross validation function of xgboost +#' The cross validation function of xgboost. #' #' @param params the list of parameters. The complete list of parameters is #' available in the \href{http://xgboost.readthedocs.io/en/latest/parameter.html}{online documentation}. Below @@ -19,15 +19,19 @@ #' #' See \code{\link{xgb.train}} for further details. #' See also demo/ for walkthrough example in R. -#' @param data takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input. +#' +#' Note that, while `params` accepts a `seed` entry and will use such parameter for model training if +#' supplied, this seed is not used for creation of train-test splits, which instead rely on R's own RNG +#' system - thus, for reproducible results, one needs to call the `set.seed` function beforehand. +#' @param data An `xgb.DMatrix` object, with corresponding fields like `label` or bounds as required +#' for model training by the objective. +#' +#' Note that only the basic `xgb.DMatrix` class is supported - variants such as `xgb.QuantileDMatrix` +#' or `xgb.ExternalDMatrix` are not supported here. #' @param nrounds the max number of iterations #' @param nfold the original dataset is randomly partitioned into \code{nfold} equal size subsamples. -#' @param label vector of response values. Should be provided only when data is an R-matrix. -#' @param missing is only used when input is a dense matrix. By default is set to NA, which means -#' that NA values should be considered as 'missing' by the algorithm. -#' Sometimes, 0 or other extreme value might be used to represent missing values. #' @param prediction A logical value indicating whether to return the test fold predictions -#' from each CV model. This parameter engages the \code{\link{cb.cv.predict}} callback. +#' from each CV model. This parameter engages the \code{\link{xgb.cb.cv.predict}} callback. #' @param showsd \code{boolean}, whether to show standard deviation of cross validation #' @param metrics, list of evaluation metrics to be used in cross validation, #' when it is not specified, the evaluation metric is chosen according to objective function. @@ -47,27 +51,44 @@ #' @param feval customized evaluation function. Returns #' \code{list(metric='metric-name', value='metric-value')} with given #' prediction and dtrain. -#' @param stratified a \code{boolean} indicating whether sampling of folds should be stratified -#' by the values of outcome labels. +#' @param stratified A \code{boolean} indicating whether sampling of folds should be stratified +#' by the values of outcome labels. For real-valued labels in regression objectives, +#' stratification will be done by discretizing the labels into up to 5 buckets beforehand. 
+#' +#' If passing "auto", will be set to `TRUE` if the objective in `params` is a classification +#' objective (from XGBoost's built-in objectives, doesn't apply to custom ones), and to +#' `FALSE` otherwise. +#' +#' This parameter is ignored when `data` has a `group` field - in such case, the splitting +#' will be based on whole groups (note that this might make the folds have different sizes). +#' +#' Value `TRUE` here is \bold{not} supported for custom objectives. #' @param folds \code{list} provides a possibility to use a list of pre-defined CV folds #' (each element must be a vector of test fold's indices). When folds are supplied, #' the \code{nfold} and \code{stratified} parameters are ignored. +#' +#' If `data` has a `group` field and the objective requires this field, each fold (list element) +#' must additionally have two attributes (retrievable through \link{attributes}) named `group_test` +#' and `group_train`, which should hold the `group` to assign through \link{setinfo.xgb.DMatrix} to +#' the resulting DMatrices. #' @param train_folds \code{list} list specifying which indicies to use for training. If \code{NULL} #' (the default) all indices not specified in \code{folds} will be used for training. +#' +#' This is not supported when `data` has `group` field. #' @param verbose \code{boolean}, print the statistics during the process #' @param print_every_n Print each n-th iteration evaluation messages when \code{verbose>0}. #' Default is 1 which means all messages are printed. This parameter is passed to the -#' \code{\link{cb.print.evaluation}} callback. +#' \code{\link{xgb.cb.print.evaluation}} callback. #' @param early_stopping_rounds If \code{NULL}, the early stopping function is not triggered. #' If set to an integer \code{k}, training with a validation set will stop if the performance #' doesn't improve for \code{k} rounds. -#' Setting this parameter engages the \code{\link{cb.early.stop}} callback. +#' Setting this parameter engages the \code{\link{xgb.cb.early.stop}} callback. #' @param maximize If \code{feval} and \code{early_stopping_rounds} are set, #' then this parameter must be set as well. #' When it is \code{TRUE}, it means the larger the evaluation score the better. -#' This parameter is passed to the \code{\link{cb.early.stop}} callback. +#' This parameter is passed to the \code{\link{xgb.cb.early.stop}} callback. #' @param callbacks a list of callback functions to perform various task during boosting. -#' See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the +#' See \code{\link{xgb.Callback}}. Some of the callbacks are automatically created depending on the #' parameters' values. User can provide either existing or their own callback methods in order #' to customize the training process. #' @param ... other parameters to pass to \code{params}. @@ -90,25 +111,25 @@ #' \itemize{ #' \item \code{call} a function call. #' \item \code{params} parameters that were passed to the xgboost library. Note that it does not -#' capture parameters changed by the \code{\link{cb.reset.parameters}} callback. -#' \item \code{callbacks} callback functions that were either automatically assigned or -#' explicitly passed. +#' capture parameters changed by the \code{\link{xgb.cb.reset.parameters}} callback. 
#' \item \code{evaluation_log} evaluation history stored as a \code{data.table} with the #' first column corresponding to iteration number and the rest corresponding to the #' CV-based evaluation means and standard deviations for the training and test CV-sets. -#' It is created by the \code{\link{cb.evaluation.log}} callback. +#' It is created by the \code{\link{xgb.cb.evaluation.log}} callback. #' \item \code{niter} number of boosting iterations. #' \item \code{nfeatures} number of features in training data. #' \item \code{folds} the list of CV folds' indices - either those passed through the \code{folds} #' parameter or randomly generated. #' \item \code{best_iteration} iteration number with the best evaluation metric value #' (only available with early stopping). -#' \item \code{pred} CV prediction values available when \code{prediction} is set. -#' It is either vector or matrix (see \code{\link{cb.cv.predict}}). -#' \item \code{models} a list of the CV folds' models. It is only available with the explicit -#' setting of the \code{cb.cv.predict(save_models = TRUE)} callback. #' } #' +#' Plus other potential elements that are the result of callbacks, such as a list `cv_predict` with +#' a sub-element `pred` when passing `prediction = TRUE`, which is added by the \link{xgb.cb.cv.predict} +#' callback (note that one can also pass it manually under `callbacks` with different settings, +#' such as saving also the models created during cross validation); or a list `early_stop` which +#' will contain elements such as `best_iteration` when using the early stopping callback (\link{xgb.cb.early.stop}). +#' #' @examples #' data(agaricus.train, package='xgboost') #' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2)) @@ -118,13 +139,14 @@ #' print(cv, verbose=TRUE) #' #' @export -xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing = NA, +xgb.cv <- function(params = list(), data, nrounds, nfold, prediction = FALSE, showsd = TRUE, metrics = list(), - obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, train_folds = NULL, + obj = NULL, feval = NULL, stratified = "auto", folds = NULL, train_folds = NULL, verbose = TRUE, print_every_n = 1L, early_stopping_rounds = NULL, maximize = NULL, callbacks = list(), ...) { check.deprecation(...) + stopifnot(inherits(data, "xgb.DMatrix")) if (inherits(data, "xgb.DMatrix") && .Call(XGCheckNullPtr_R, data)) { stop("'data' is an invalid 'xgb.DMatrix' object. 
Must be constructed again.") } @@ -137,16 +159,22 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing check.custom.obj() check.custom.eval() - # Check the labels - if ((inherits(data, 'xgb.DMatrix') && !xgb.DMatrix.hasinfo(data, 'label')) || - (!inherits(data, 'xgb.DMatrix') && is.null(label))) { - stop("Labels must be provided for CV either through xgb.DMatrix, or through 'label=' when 'data' is matrix") - } else if (inherits(data, 'xgb.DMatrix')) { - if (!is.null(label)) - warning("xgb.cv: label will be ignored, since data is of type xgb.DMatrix") - cv_label <- getinfo(data, 'label') - } else { - cv_label <- label + if (stratified == "auto") { + if (is.character(params$objective)) { + stratified <- ( + (params$objective %in% .CLASSIFICATION_OBJECTIVES()) + && !(params$objective %in% .RANKING_OBJECTIVES()) + ) + } else { + stratified <- FALSE + } + } + + # Check the labels and groups + cv_label <- getinfo(data, "label") + cv_group <- getinfo(data, "group") + if (!is.null(train_folds) && NROW(cv_group)) { + stop("'train_folds' is not supported for DMatrix object with 'group' field.") } # CV folds @@ -157,63 +185,64 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing } else { if (nfold <= 1) stop("'nfold' must be > 1") - folds <- generate.cv.folds(nfold, nrow(data), stratified, cv_label, params) + folds <- generate.cv.folds(nfold, nrow(data), stratified, cv_label, cv_group, params) } + # Callbacks + tmp <- .process.callbacks(callbacks, is_cv = TRUE) + callbacks <- tmp$callbacks + cb_names <- tmp$cb_names + rm(tmp) + + # Early stopping callback + if (!is.null(early_stopping_rounds) && !("early_stop" %in% cb_names)) { + callbacks <- add.callback( + callbacks, + xgb.cb.early.stop( + early_stopping_rounds, + maximize = maximize, + verbose = verbose + ), + as_first_elt = TRUE + ) + } # verbosity & evaluation printing callback: params <- c(params, list(silent = 1)) print_every_n <- max(as.integer(print_every_n), 1L) - if (!has.callbacks(callbacks, 'cb.print.evaluation') && verbose) { - callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n, showsd = showsd)) + if (verbose && !("print_evaluation" %in% cb_names)) { + callbacks <- add.callback(callbacks, xgb.cb.print.evaluation(print_every_n, showsd = showsd)) } # evaluation log callback: always is on in CV - evaluation_log <- list() - if (!has.callbacks(callbacks, 'cb.evaluation.log')) { - callbacks <- add.cb(callbacks, cb.evaluation.log()) - } - # Early stopping callback - stop_condition <- FALSE - if (!is.null(early_stopping_rounds) && - !has.callbacks(callbacks, 'cb.early.stop')) { - callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds, - maximize = maximize, verbose = verbose)) + if (!("evaluation_log" %in% cb_names)) { + callbacks <- add.callback(callbacks, xgb.cb.evaluation.log()) } # CV-predictions callback - if (prediction && - !has.callbacks(callbacks, 'cb.cv.predict')) { - callbacks <- add.cb(callbacks, cb.cv.predict(save_models = FALSE)) + if (prediction && !("cv_predict" %in% cb_names)) { + callbacks <- add.callback(callbacks, xgb.cb.cv.predict(save_models = FALSE)) } - # Sort the callbacks into categories - cb <- categorize.callbacks(callbacks) - # create the booster-folds # train_folds - dall <- xgb.get.DMatrix( - data = data, - label = label, - missing = missing, - weight = NULL, - nthread = params$nthread - ) + dall <- data bst_folds <- lapply(seq_along(folds), function(k) { - dtest <- xgb.slice.DMatrix(dall, folds[[k]]) + dtest <- 
xgb.slice.DMatrix(dall, folds[[k]], allow_groups = TRUE) # code originally contributed by @RolandASc on stackoverflow if (is.null(train_folds)) - dtrain <- xgb.slice.DMatrix(dall, unlist(folds[-k])) + dtrain <- xgb.slice.DMatrix(dall, unlist(folds[-k]), allow_groups = TRUE) else - dtrain <- xgb.slice.DMatrix(dall, train_folds[[k]]) + dtrain <- xgb.slice.DMatrix(dall, train_folds[[k]], allow_groups = TRUE) + if (!is.null(attributes(folds[[k]])$group_test)) { + setinfo(dtest, "group", attributes(folds[[k]])$group_test) + setinfo(dtrain, "group", attributes(folds[[k]])$group_train) + } bst <- xgb.Booster( params = params, cachelist = list(dtrain, dtest), modelfile = NULL ) bst <- bst$bst - list(dtrain = dtrain, bst = bst, watchlist = list(train = dtrain, test = dtest), index = folds[[k]]) + list(dtrain = dtrain, bst = bst, evals = list(train = dtrain, test = dtest), index = folds[[k]]) }) - rm(dall) - # a "basket" to collect some results from callbacks - basket <- list() # extract parameters that can affect the relationship b/w #trees and #iterations num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1) # nolint @@ -222,10 +251,25 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing begin_iteration <- 1 end_iteration <- nrounds + .execute.cb.before.training( + callbacks, + bst_folds, + dall, + NULL, + begin_iteration, + end_iteration + ) + # synchronous CV boosting: run CV folds' models within each iteration for (iteration in begin_iteration:end_iteration) { - for (f in cb$pre_iter) f() + .execute.cb.before.iter( + callbacks, + bst_folds, + dall, + NULL, + iteration + ) msg <- lapply(bst_folds, function(fd) { xgb.iter.update( @@ -236,33 +280,42 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing ) xgb.iter.eval( bst = fd$bst, - watchlist = fd$watchlist, + evals = fd$evals, iter = iteration - 1, feval = feval ) }) msg <- simplify2array(msg) - # Note: these variables might look unused here, but they are used in the callbacks - bst_evaluation <- rowMeans(msg) # nolint - bst_evaluation_err <- apply(msg, 1, sd) # nolint - for (f in cb$post_iter) f() + should_stop <- .execute.cb.after.iter( + callbacks, + bst_folds, + dall, + NULL, + iteration, + msg + ) - if (stop_condition) break + if (should_stop) break } - for (f in cb$finalize) f(finalize = TRUE) + cb_outputs <- .execute.cb.after.training( + callbacks, + bst_folds, + dall, + NULL, + iteration, + msg + ) # the CV result ret <- list( call = match.call(), params = params, - callbacks = callbacks, - evaluation_log = evaluation_log, - niter = end_iteration, - nfeatures = ncol(data), + niter = iteration, + nfeatures = ncol(dall), folds = folds ) - ret <- c(ret, basket) + ret <- c(ret, cb_outputs) class(ret) <- 'xgb.cv.synchronous' return(invisible(ret)) @@ -285,8 +338,8 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing #' @examples #' data(agaricus.train, package='xgboost') #' train <- agaricus.train -#' cv <- xgb.cv(data = train$data, label = train$label, nfold = 5, max_depth = 2, -#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") +#' cv <- xgb.cv(data = xgb.DMatrix(train$data, label = train$label), nfold = 5, max_depth = 2, +#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") #' print(cv) #' print(cv, verbose=TRUE) #' @@ -308,23 +361,16 @@ print.xgb.cv.synchronous <- function(x, verbose = FALSE, ...) 
{ paste0('"', unlist(x$params), '"'), sep = ' = ', collapse = ', '), '\n', sep = '') } - if (!is.null(x$callbacks) && length(x$callbacks) > 0) { - cat('callbacks:\n') - lapply(callback.calls(x$callbacks), function(x) { - cat(' ') - print(x) - }) - } for (n in c('niter', 'best_iteration')) { - if (is.null(x[[n]])) + if (is.null(x$early_stop[[n]])) next - cat(n, ': ', x[[n]], '\n', sep = '') + cat(n, ': ', x$early_stop[[n]], '\n', sep = '') } - if (!is.null(x$pred)) { + if (!is.null(x$cv_predict$pred)) { cat('pred:\n') - str(x$pred) + str(x$cv_predict$pred) } } @@ -332,9 +378,9 @@ print.xgb.cv.synchronous <- function(x, verbose = FALSE, ...) { cat('evaluation_log:\n') print(x$evaluation_log, row.names = FALSE, ...) - if (!is.null(x$best_iteration)) { + if (!is.null(x$early_stop$best_iteration)) { cat('Best iteration:\n') - print(x$evaluation_log[x$best_iteration], row.names = FALSE, ...) + print(x$evaluation_log[x$early_stop$best_iteration], row.names = FALSE, ...) } invisible(x) } diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index 3a3d2c7dc..2fa5bcb2f 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -24,6 +24,7 @@ #' as a \code{character} vector. Otherwise it will return \code{TRUE}. #' #' @examples +#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)} #' data(agaricus.train, package='xgboost') #' data(agaricus.test, package='xgboost') #' train <- agaricus.train diff --git a/R-package/R/xgb.load.R b/R-package/R/xgb.load.R index 7d1eab7e9..d5b192bcb 100644 --- a/R-package/R/xgb.load.R +++ b/R-package/R/xgb.load.R @@ -6,7 +6,7 @@ #' #' @details #' The input file is expected to contain a model saved in an xgboost model format -#' using either \code{\link{xgb.save}} or \code{\link{cb.save.model}} in R, or using some +#' using either \code{\link{xgb.save}} or \code{\link{xgb.cb.save.model}} in R, or using some #' appropriate methods from other xgboost interfaces. E.g., a model trained in Python and #' saved from there in xgboost format, could be loaded from R. #' @@ -20,6 +20,7 @@ #' \code{\link{xgb.save}} #' #' @examples +#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)} #' data(agaricus.train, package='xgboost') #' data(agaricus.test, package='xgboost') #' diff --git a/R-package/R/xgb.save.R b/R-package/R/xgb.save.R index e1a61d196..91c545ff7 100644 --- a/R-package/R/xgb.save.R +++ b/R-package/R/xgb.save.R @@ -35,6 +35,7 @@ #' \code{\link{xgb.load}} #' #' @examples +#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)} #' data(agaricus.train, package='xgboost') #' data(agaricus.test, package='xgboost') #' diff --git a/R-package/R/xgb.save.raw.R b/R-package/R/xgb.save.raw.R index c124a752b..c04f06d9c 100644 --- a/R-package/R/xgb.save.raw.R +++ b/R-package/R/xgb.save.raw.R @@ -12,6 +12,7 @@ #' } #' #' @examples +#' \dontshow{RhpcBLASctl::omp_set_num_threads(1)} #' data(agaricus.train, package='xgboost') #' data(agaricus.test, package='xgboost') #' diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index f0f2332b5..4cea088e0 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -114,13 +114,13 @@ #' @param data training dataset. \code{xgb.train} accepts only an \code{xgb.DMatrix} as the input. #' \code{xgboost}, in addition, also accepts \code{matrix}, \code{dgCMatrix}, or name of a local data file. #' @param nrounds max number of boosting iterations. -#' @param watchlist named list of xgb.DMatrix datasets to use for evaluating model performance. +#' @param evals Named list of `xgb.DMatrix` datasets to use for evaluating model performance. 
#' Metrics specified in either \code{eval_metric} or \code{feval} will be computed for each #' of these datasets during each boosting iteration, and stored in the end as a field named #' \code{evaluation_log} in the resulting object. When either \code{verbose>=1} or -#' \code{\link{cb.print.evaluation}} callback is engaged, the performance results are continuously +#' \code{\link{xgb.cb.print.evaluation}} callback is engaged, the performance results are continuously #' printed out during the training. -#' E.g., specifying \code{watchlist=list(validation1=mat1, validation2=mat2)} allows to track +#' E.g., specifying \code{evals=list(validation1=mat1, validation2=mat2)} allows to track #' the performance of each round's model on mat1 and mat2. #' @param obj customized objective function. Returns gradient and second order #' gradient with given prediction and dtrain. @@ -130,31 +130,32 @@ #' @param verbose If 0, xgboost will stay silent. If 1, it will print information about performance. #' If 2, some additional information will be printed out. #' Note that setting \code{verbose > 0} automatically engages the -#' \code{cb.print.evaluation(period=1)} callback function. +#' \code{xgb.cb.print.evaluation(period=1)} callback function. #' @param print_every_n Print each n-th iteration evaluation messages when \code{verbose>0}. #' Default is 1 which means all messages are printed. This parameter is passed to the -#' \code{\link{cb.print.evaluation}} callback. +#' \code{\link{xgb.cb.print.evaluation}} callback. #' @param early_stopping_rounds If \code{NULL}, the early stopping function is not triggered. #' If set to an integer \code{k}, training with a validation set will stop if the performance #' doesn't improve for \code{k} rounds. -#' Setting this parameter engages the \code{\link{cb.early.stop}} callback. +#' Setting this parameter engages the \code{\link{xgb.cb.early.stop}} callback. #' @param maximize If \code{feval} and \code{early_stopping_rounds} are set, #' then this parameter must be set as well. #' When it is \code{TRUE}, it means the larger the evaluation score the better. -#' This parameter is passed to the \code{\link{cb.early.stop}} callback. +#' This parameter is passed to the \code{\link{xgb.cb.early.stop}} callback. #' @param save_period when it is non-NULL, model is saved to disk after every \code{save_period} rounds, -#' 0 means save at the end. The saving is handled by the \code{\link{cb.save.model}} callback. +#' 0 means save at the end. The saving is handled by the \code{\link{xgb.cb.save.model}} callback. #' @param save_name the name or path for periodically saved model file. #' @param xgb_model a previously built model to continue the training from. #' Could be either an object of class \code{xgb.Booster}, or its raw data, or the name of a #' file with a previously saved model. #' @param callbacks a list of callback functions to perform various task during boosting. -#' See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the +#' See \code{\link{xgb.Callback}}. Some of the callbacks are automatically created depending on the #' parameters' values. User can provide either existing or their own callback methods in order #' to customize the training process. 
#'
-#' Note that some callbacks might try to set an evaluation log - be aware that these evaluation logs
-#' are kept as R attributes, and thus do not get saved when using non-R serializaters like
+#' Note that some callbacks might try to leave attributes in the resulting model object,
+#' such as an evaluation log (a `data.table` object) - be aware that these objects are kept
+#' as R attributes, and thus do not get saved when using XGBoost's own serializers like
+#' \link{xgb.save} (but are kept when using R serializers like \link{saveRDS}).
#' @param ... other parameters to pass to \code{params}.
#' @param label vector of response values. Should not be provided when data is
@@ -170,7 +171,7 @@
#' @details
#' These are the training functions for \code{xgboost}.
#'
-#' The \code{xgb.train} interface supports advanced features such as \code{watchlist},
+#' The \code{xgb.train} interface supports advanced features such as \code{evals},
#' customized objective and evaluation metric functions, therefore it is more flexible
#' than the \code{xgboost} interface.
#'
@@ -178,6 +179,11 @@
#' Number of threads can also be manually specified via the \code{nthread}
#' parameter.
#'
+#' While in other interfaces the default random seed is zero, in R, if a parameter `seed`
+#' is not manually supplied, it will generate a random seed through R's own random number generator,
+#' whose seed in turn is controllable through `set.seed`. If `seed` is passed, it will override the
+#' RNG from R.
+#'
#' The evaluation metric is chosen automatically by XGBoost (according to the objective)
#' when the \code{eval_metric} parameter is not provided.
#' User may set one or several \code{eval_metric} parameters.
@@ -201,18 +207,19 @@
#'
#' The following callbacks are automatically created when certain parameters are set:
#' \itemize{
-#' \item \code{cb.print.evaluation} is turned on when \code{verbose > 0};
+#' \item \code{xgb.cb.print.evaluation} is turned on when \code{verbose > 0};
#' and the \code{print_every_n} parameter is passed to it.
-#' \item \code{cb.evaluation.log} is on when \code{watchlist} is present.
-#' \item \code{cb.early.stop}: when \code{early_stopping_rounds} is set.
-#' \item \code{cb.save.model}: when \code{save_period > 0} is set.
+#' \item \code{xgb.cb.evaluation.log} is on when \code{evals} is present.
+#' \item \code{xgb.cb.early.stop}: when \code{early_stopping_rounds} is set.
+#' \item \code{xgb.cb.save.model}: when \code{save_period > 0} is set.
#' }
#'
#' Note that objects of type `xgb.Booster` as returned by this function behave a bit differently
#' from typical R objects (it's an 'altrep' list class), and it makes a separation between
#' internal booster attributes (restricted to jsonifyable data), accessed through \link{xgb.attr}
#' and shared between interfaces through serialization functions like \link{xgb.save}; and
-#' R-specific attributes, accessed through \link{attributes} and \link{attr}, which are otherwise
+#' R-specific attributes (typically the result from a callback), accessed through \link{attributes}
+#' and \link{attr}, which are otherwise
#' only used in the R interface, only kept when using R's serializers like \link{saveRDS}, and
#' not anyhow used by functions like \link{predict.xgb.Booster}.
#'
@@ -224,7 +231,7 @@
#' effect elsewhere.
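A minimal sketch of the seeding behaviour described above, assuming `param` and `dtrain` are defined as in the examples that follow:

# Option 1: let xgb.train() draw its 'seed' from R's RNG, fixed via set.seed().
set.seed(123)
bst1 <- xgb.train(param, dtrain, nrounds = 2)
# Option 2: pass 'seed' explicitly in the parameter list; this overrides R's RNG.
bst2 <- xgb.train(c(param, list(seed = 123)), dtrain, nrounds = 2)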
#' #' @seealso -#' \code{\link{callbacks}}, +#' \code{\link{xgb.Callback}}, #' \code{\link{predict.xgb.Booster}}, #' \code{\link{xgb.cv}} #' @@ -247,12 +254,12 @@ #' dtest <- with( #' agaricus.test, xgb.DMatrix(data, label = label, nthread = nthread) #' ) -#' watchlist <- list(train = dtrain, eval = dtest) +#' evals <- list(train = dtrain, eval = dtest) #' #' ## A simple xgb.train example: #' param <- list(max_depth = 2, eta = 1, nthread = nthread, #' objective = "binary:logistic", eval_metric = "auc") -#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0) +#' bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0) #' #' ## An xgb.train example where custom objective and evaluation metric are #' ## used: @@ -273,15 +280,15 @@ #' # as 'objective' and 'eval_metric' parameters in the params list: #' param <- list(max_depth = 2, eta = 1, nthread = nthread, #' objective = logregobj, eval_metric = evalerror) -#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0) +#' bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0) #' #' # or through the ... arguments: #' param <- list(max_depth = 2, eta = 1, nthread = nthread) -#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, +#' bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0, #' objective = logregobj, eval_metric = evalerror) #' #' # or as dedicated 'obj' and 'feval' parameters of xgb.train: -#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, +#' bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, #' obj = logregobj, feval = evalerror) #' #' @@ -289,11 +296,11 @@ #' param <- list(max_depth = 2, eta = 1, nthread = nthread, #' objective = "binary:logistic", eval_metric = "auc") #' my_etas <- list(eta = c(0.5, 0.1)) -#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, -#' callbacks = list(cb.reset.parameters(my_etas))) +#' bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0, +#' callbacks = list(xgb.cb.reset.parameters(my_etas))) #' #' ## Early stopping: -#' bst <- xgb.train(param, dtrain, nrounds = 25, watchlist, +#' bst <- xgb.train(param, dtrain, nrounds = 25, evals = evals, #' early_stopping_rounds = 3) #' #' ## An 'xgboost' interface example: @@ -304,7 +311,7 @@ #' #' @rdname xgb.train #' @export -xgb.train <- function(params = list(), data, nrounds, watchlist = list(), +xgb.train <- function(params = list(), data, nrounds, evals = list(), obj = NULL, feval = NULL, verbose = 1, print_every_n = 1L, early_stopping_rounds = NULL, maximize = NULL, save_period = NULL, save_name = "xgboost.model", @@ -317,68 +324,68 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), check.custom.obj() check.custom.eval() - # data & watchlist checks + # data & evals checks dtrain <- data if (!inherits(dtrain, "xgb.DMatrix")) stop("second argument dtrain must be xgb.DMatrix") - if (length(watchlist) > 0) { - if (typeof(watchlist) != "list" || - !all(vapply(watchlist, inherits, logical(1), what = 'xgb.DMatrix'))) - stop("watchlist must be a list of xgb.DMatrix elements") - evnames <- names(watchlist) + if (length(evals) > 0) { + if (typeof(evals) != "list" || + !all(vapply(evals, inherits, logical(1), what = 'xgb.DMatrix'))) + stop("'evals' must be a list of xgb.DMatrix elements") + evnames <- names(evals) if (is.null(evnames) || any(evnames == "")) - stop("each element of the watchlist must have a name tag") + stop("each element of 'evals' must have a name tag") } # Handle 
multiple evaluation metrics given as a list for (m in params$eval_metric) { params <- c(params, list(eval_metric = m)) } - # evaluation printing callback params <- c(params) - print_every_n <- max(as.integer(print_every_n), 1L) - if (!has.callbacks(callbacks, 'cb.print.evaluation') && - verbose) { - callbacks <- add.cb(callbacks, cb.print.evaluation(print_every_n)) - } - # evaluation log callback: it is automatically enabled when watchlist is provided - evaluation_log <- list() - if (!has.callbacks(callbacks, 'cb.evaluation.log') && - length(watchlist) > 0) { - callbacks <- add.cb(callbacks, cb.evaluation.log()) - } - # Model saving callback - if (!is.null(save_period) && - !has.callbacks(callbacks, 'cb.save.model')) { - callbacks <- add.cb(callbacks, cb.save.model(save_period, save_name)) - } - # Early stopping callback - stop_condition <- FALSE - if (!is.null(early_stopping_rounds) && - !has.callbacks(callbacks, 'cb.early.stop')) { - callbacks <- add.cb(callbacks, cb.early.stop(early_stopping_rounds, - maximize = maximize, verbose = verbose)) + params['validate_parameters'] <- TRUE + if (!("seed" %in% names(params))) { + params[["seed"]] <- sample(.Machine$integer.max, size = 1) } - # Sort the callbacks into categories - cb <- categorize.callbacks(callbacks) - params['validate_parameters'] <- TRUE - if (!is.null(params[['seed']])) { - warning("xgb.train: `seed` is ignored in R package. Use `set.seed()` instead.") + # callbacks + tmp <- .process.callbacks(callbacks, is_cv = FALSE) + callbacks <- tmp$callbacks + cb_names <- tmp$cb_names + rm(tmp) + + # Early stopping callback (should always come first) + if (!is.null(early_stopping_rounds) && !("early_stop" %in% cb_names)) { + callbacks <- add.callback( + callbacks, + xgb.cb.early.stop( + early_stopping_rounds, + maximize = maximize, + verbose = verbose + ), + as_first_elt = TRUE + ) + } + # evaluation printing callback + print_every_n <- max(as.integer(print_every_n), 1L) + if (verbose && !("print_evaluation" %in% cb_names)) { + callbacks <- add.callback(callbacks, xgb.cb.print.evaluation(print_every_n)) + } + # evaluation log callback: it is automatically enabled when 'evals' is provided + if (length(evals) && !("evaluation_log" %in% cb_names)) { + callbacks <- add.callback(callbacks, xgb.cb.evaluation.log()) + } + # Model saving callback + if (!is.null(save_period) && !("save_model" %in% cb_names)) { + callbacks <- add.callback(callbacks, xgb.cb.save.model(save_period, save_name)) } # The tree updating process would need slightly different handling is_update <- NVL(params[['process_type']], '.') == 'update' - past_evaluation_log <- NULL - if (inherits(xgb_model, "xgb.Booster")) { - past_evaluation_log <- attributes(xgb_model)$evaluation_log - } - # Construct a booster (either a new one or load from xgb_model) bst <- xgb.Booster( params = params, - cachelist = append(watchlist, dtrain), + cachelist = append(evals, dtrain), modelfile = xgb_model ) niter_init <- bst$niter @@ -389,11 +396,6 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), dtrain ) - # extract parameters that can affect the relationship b/w #trees and #iterations - # Note: it might look like these aren't used, but they need to be defined in this - # environment for the callbacks for work correctly. 
- num_class <- max(as.numeric(NVL(params[['num_class']], 1)), 1) # nolint - if (is_update && nrounds > niter_init) stop("nrounds cannot be larger than ", niter_init, " (nrounds of xgb_model)") @@ -401,57 +403,83 @@ xgb.train <- function(params = list(), data, nrounds, watchlist = list(), begin_iteration <- niter_skip + 1 end_iteration <- niter_skip + nrounds + .execute.cb.before.training( + callbacks, + bst, + dtrain, + evals, + begin_iteration, + end_iteration + ) + # the main loop for boosting iterations for (iteration in begin_iteration:end_iteration) { - for (f in cb$pre_iter) f() - - xgb.iter.update( - bst = bst, - dtrain = dtrain, - iter = iteration - 1, - obj = obj + .execute.cb.before.iter( + callbacks, + bst, + dtrain, + evals, + iteration ) - if (length(watchlist) > 0) { - bst_evaluation <- xgb.iter.eval( # nolint: object_usage_linter + xgb.iter.update( + bst = bst, + dtrain = dtrain, + iter = iteration - 1, + obj = obj + ) + + bst_evaluation <- NULL + if (length(evals) > 0) { + bst_evaluation <- xgb.iter.eval( bst = bst, - watchlist = watchlist, + evals = evals, iter = iteration - 1, feval = feval ) } - for (f in cb$post_iter) f() + should_stop <- .execute.cb.after.iter( + callbacks, + bst, + dtrain, + evals, + iteration, + bst_evaluation + ) - if (stop_condition) break + if (should_stop) break } - for (f in cb$finalize) f(finalize = TRUE) - # store the evaluation results - keep_evaluation_log <- FALSE - if (length(evaluation_log) > 0 && nrow(evaluation_log) > 0) { - keep_evaluation_log <- TRUE - # include the previous compatible history when available - if (inherits(xgb_model, 'xgb.Booster') && - !is_update && - !is.null(past_evaluation_log) && - isTRUE(all.equal(colnames(evaluation_log), - colnames(past_evaluation_log)))) { - evaluation_log <- rbindlist(list(past_evaluation_log, evaluation_log)) - } - } + cb_outputs <- .execute.cb.after.training( + callbacks, + bst, + dtrain, + evals, + iteration, + bst_evaluation + ) extra_attrs <- list( call = match.call(), - params = params, - callbacks = callbacks + params = params ) - if (keep_evaluation_log) { - extra_attrs$evaluation_log <- evaluation_log - } + curr_attrs <- attributes(bst) - attributes(bst) <- c(curr_attrs, extra_attrs) + if (NROW(curr_attrs)) { + curr_attrs <- curr_attrs[ + setdiff( + names(curr_attrs), + c(names(extra_attrs), names(cb_outputs)) + ) + ] + } + curr_attrs <- c(extra_attrs, curr_attrs) + if (NROW(cb_outputs)) { + curr_attrs <- c(curr_attrs, cb_outputs) + } + attributes(bst) <- curr_attrs return(bst) } diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index 170aa5ffd..a1d373581 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -18,9 +18,9 @@ xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL, nthread = merged$nthread ) - watchlist <- list(train = dtrain) + evals <- list(train = dtrain) - bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, print_every_n = print_every_n, + bst <- xgb.train(params, dtrain, nrounds, evals, verbose = verbose, print_every_n = print_every_n, early_stopping_rounds = early_stopping_rounds, maximize = maximize, save_period = save_period, save_name = save_name, xgb_model = xgb_model, callbacks = callbacks, ...) 
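A minimal sketch of the renamed argument, assuming `dtrain` and `dtest` are `xgb.DMatrix` objects as in the demos below - evaluation sets are now passed through `evals` rather than `watchlist`:

evals <- list(train = dtrain, eval = dtest)
param <- list(max_depth = 2, eta = 1, nthread = 2, objective = "binary:logistic")
bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals)
# The per-iteration metrics are collected by the xgb.cb.evaluation.log() callback
# and kept as an R attribute on the returned booster:
attributes(bst)$evaluation_log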
@@ -82,12 +82,8 @@ NULL NULL # Various imports -#' @importClassesFrom Matrix dgCMatrix dgeMatrix dgRMatrix -#' @importFrom Matrix colSums +#' @importClassesFrom Matrix dgCMatrix dgRMatrix CsparseMatrix #' @importFrom Matrix sparse.model.matrix -#' @importFrom Matrix sparseVector -#' @importFrom Matrix sparseMatrix -#' @importFrom Matrix t #' @importFrom data.table data.table #' @importFrom data.table is.data.table #' @importFrom data.table as.data.table @@ -103,6 +99,7 @@ NULL #' @importFrom stats coef #' @importFrom stats predict #' @importFrom stats median +#' @importFrom stats sd #' @importFrom stats variable.names #' @importFrom utils head #' @importFrom graphics barplot diff --git a/R-package/demo/basic_walkthrough.R b/R-package/demo/basic_walkthrough.R index 31f79fb57..9403bac20 100644 --- a/R-package/demo/basic_walkthrough.R +++ b/R-package/demo/basic_walkthrough.R @@ -55,6 +55,8 @@ print(paste("test-error=", err)) # save model to binary local file xgb.save(bst, "xgboost.model") # load binary model to R +# Function doesn't take 'nthreads', but can be set like this: +RhpcBLASctl::omp_set_num_threads(1) bst2 <- xgb.load("xgboost.model") pred2 <- predict(bst2, test$data) # pred2 should be identical to pred @@ -72,17 +74,17 @@ print(paste("sum(abs(pred3-pred))=", sum(abs(pred3 - pred)))) # to use advanced features, we need to put data in xgb.DMatrix dtrain <- xgb.DMatrix(data = train$data, label = train$label) dtest <- xgb.DMatrix(data = test$data, label = test$label) -#---------------Using watchlist---------------- -# watchlist is a list of xgb.DMatrix, each of them is tagged with name -watchlist <- list(train = dtrain, test = dtest) -# to train with watchlist, use xgb.train, which contains more advanced features -# watchlist allows us to monitor the evaluation result on all data in the list -print("Train xgboost using xgb.train with watchlist") -bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, watchlist = watchlist, +#---------------Using an evaluation set---------------- +# 'evals' is a list of xgb.DMatrix, each of them is tagged with name +evals <- list(train = dtrain, test = dtest) +# to train with an evaluation set, use xgb.train, which contains more advanced features +# 'evals' argument allows us to monitor the evaluation result on all data in the list +print("Train xgboost using xgb.train with evaluation data") +bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, evals = evals, nthread = 2, objective = "binary:logistic") # we can change evaluation metrics, or use multiple evaluation metrics -print("train xgboost using xgb.train with watchlist, watch logloss and error") -bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, watchlist = watchlist, +print("train xgboost using xgb.train with evaluation data, watch logloss and error") +bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, evals = evals, eval_metric = "error", eval_metric = "logloss", nthread = 2, objective = "binary:logistic") @@ -90,7 +92,7 @@ bst <- xgb.train(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, watchlist = xgb.DMatrix.save(dtrain, "dtrain.buffer") # to load it in, simply call xgb.DMatrix dtrain2 <- xgb.DMatrix("dtrain.buffer") -bst <- xgb.train(data = dtrain2, max_depth = 2, eta = 1, nrounds = 2, watchlist = watchlist, +bst <- xgb.train(data = dtrain2, max_depth = 2, eta = 1, nrounds = 2, evals = evals, nthread = 2, objective = "binary:logistic") # information can be extracted from xgb.DMatrix using getinfo label <- getinfo(dtest, 
"label") diff --git a/R-package/demo/boost_from_prediction.R b/R-package/demo/boost_from_prediction.R index 1a3d55369..75af70dba 100644 --- a/R-package/demo/boost_from_prediction.R +++ b/R-package/demo/boost_from_prediction.R @@ -5,14 +5,14 @@ data(agaricus.test, package = 'xgboost') dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) -watchlist <- list(eval = dtest, train = dtrain) +evals <- list(eval = dtest, train = dtrain) ### # advanced: start from a initial base prediction # print('start running example to start from a initial prediction') # train xgboost for 1 round param <- list(max_depth = 2, eta = 1, nthread = 2, objective = 'binary:logistic') -bst <- xgb.train(param, dtrain, 1, watchlist) +bst <- xgb.train(param, dtrain, 1, evals) # Note: we need the margin value instead of transformed prediction in set_base_margin # do predict with output_margin=TRUE, will always give you margin values before logistic transformation ptrain <- predict(bst, dtrain, outputmargin = TRUE) @@ -23,4 +23,4 @@ setinfo(dtrain, "base_margin", ptrain) setinfo(dtest, "base_margin", ptest) print('this is result of boost from initial prediction') -bst <- xgb.train(params = param, data = dtrain, nrounds = 1, watchlist = watchlist) +bst <- xgb.train(params = param, data = dtrain, nrounds = 1, evals = evals) diff --git a/R-package/demo/custom_objective.R b/R-package/demo/custom_objective.R index 35201332c..03d7b3464 100644 --- a/R-package/demo/custom_objective.R +++ b/R-package/demo/custom_objective.R @@ -8,7 +8,7 @@ dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) # note: for customized objective function, we leave objective as default # note: what we are getting is margin value in prediction # you must know what you are doing -watchlist <- list(eval = dtest, train = dtrain) +evals <- list(eval = dtest, train = dtrain) num_round <- 2 # user define objective function, given prediction, return gradient and second order gradient @@ -38,7 +38,7 @@ param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0, print('start training with user customized objective') # training with customized objective, we can also do step by step training # simply look at xgboost.py's implementation of train -bst <- xgb.train(param, dtrain, num_round, watchlist) +bst <- xgb.train(param, dtrain, num_round, evals) # # there can be cases where you want additional information @@ -62,4 +62,4 @@ param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0, print('start training with user customized objective, with additional attributes in DMatrix') # training with customized objective, we can also do step by step training # simply look at xgboost.py's implementation of train -bst <- xgb.train(param, dtrain, num_round, watchlist) +bst <- xgb.train(param, dtrain, num_round, evals) diff --git a/R-package/demo/early_stopping.R b/R-package/demo/early_stopping.R index 04da1382f..057440882 100644 --- a/R-package/demo/early_stopping.R +++ b/R-package/demo/early_stopping.R @@ -8,7 +8,7 @@ dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) # note: what we are getting is margin value in prediction # you must know what you are doing param <- list(max_depth = 2, eta = 1, nthread = 2, verbosity = 0) -watchlist <- list(eval = dtest) +evals <- list(eval = dtest) num_round <- 20 # user define objective function, given prediction, return gradient and second order gradient # this is log likelihood loss @@ -32,7 +32,7 @@ 
evalerror <- function(preds, dtrain) { } print('start training with early Stopping setting') -bst <- xgb.train(param, dtrain, num_round, watchlist, +bst <- xgb.train(param, dtrain, num_round, evals, objective = logregobj, eval_metric = evalerror, maximize = FALSE, early_stopping_round = 3) bst <- xgb.cv(param, dtrain, num_round, nfold = 5, diff --git a/R-package/demo/generalized_linear_model.R b/R-package/demo/generalized_linear_model.R index c24fe72cb..d29a6dc5b 100644 --- a/R-package/demo/generalized_linear_model.R +++ b/R-package/demo/generalized_linear_model.R @@ -25,9 +25,9 @@ param <- list(objective = "binary:logistic", booster = "gblinear", ## # the rest of settings are the same ## -watchlist <- list(eval = dtest, train = dtrain) +evals <- list(eval = dtest, train = dtrain) num_round <- 2 -bst <- xgb.train(param, dtrain, num_round, watchlist) +bst <- xgb.train(param, dtrain, num_round, evals) ypred <- predict(bst, dtest) labels <- getinfo(dtest, 'label') cat('error of preds=', mean(as.numeric(ypred > 0.5) != labels), '\n') diff --git a/R-package/demo/gpu_accelerated.R b/R-package/demo/gpu_accelerated.R index 14ed9392b..617a63e74 100644 --- a/R-package/demo/gpu_accelerated.R +++ b/R-package/demo/gpu_accelerated.R @@ -23,7 +23,7 @@ y <- rbinom(N, 1, plogis(m)) tr <- sample.int(N, N * 0.75) dtrain <- xgb.DMatrix(X[tr, ], label = y[tr]) dtest <- xgb.DMatrix(X[-tr, ], label = y[-tr]) -wl <- list(train = dtrain, test = dtest) +evals <- list(train = dtrain, test = dtest) # An example of running 'gpu_hist' algorithm # which is @@ -35,11 +35,11 @@ wl <- list(train = dtrain, test = dtest) param <- list(objective = 'reg:logistic', eval_metric = 'auc', subsample = 0.5, nthread = 4, max_bin = 64, tree_method = 'gpu_hist') pt <- proc.time() -bst_gpu <- xgb.train(param, dtrain, watchlist = wl, nrounds = 50) +bst_gpu <- xgb.train(param, dtrain, evals = evals, nrounds = 50) proc.time() - pt # Compare to the 'hist' algorithm: param$tree_method <- 'hist' pt <- proc.time() -bst_hist <- xgb.train(param, dtrain, watchlist = wl, nrounds = 50) +bst_hist <- xgb.train(param, dtrain, evals = evals, nrounds = 50) proc.time() - pt diff --git a/R-package/demo/predict_first_ntree.R b/R-package/demo/predict_first_ntree.R index 179c18c70..ba15ab39a 100644 --- a/R-package/demo/predict_first_ntree.R +++ b/R-package/demo/predict_first_ntree.R @@ -6,11 +6,11 @@ dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) param <- list(max_depth = 2, eta = 1, objective = 'binary:logistic') -watchlist <- list(eval = dtest, train = dtrain) +evals <- list(eval = dtest, train = dtrain) nrounds <- 2 # training the model for two rounds -bst <- xgb.train(param, dtrain, nrounds, nthread = 2, watchlist) +bst <- xgb.train(param, dtrain, nrounds, nthread = 2, evals = evals) cat('start testing prediction from first n trees\n') labels <- getinfo(dtest, 'label') diff --git a/R-package/demo/predict_leaf_indices.R b/R-package/demo/predict_leaf_indices.R index 21b6fa71d..a57baf668 100644 --- a/R-package/demo/predict_leaf_indices.R +++ b/R-package/demo/predict_leaf_indices.R @@ -43,7 +43,6 @@ colnames(new.features.test) <- colnames(new.features.train) # learning with new features new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label) new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label) -watchlist <- list(train = new.dtrain) bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds, nthread = 
2) # Model accuracy with new features diff --git a/R-package/demo/tweedie_regression.R b/R-package/demo/tweedie_regression.R index dfaf6a2ae..b07858e76 100644 --- a/R-package/demo/tweedie_regression.R +++ b/R-package/demo/tweedie_regression.R @@ -39,7 +39,7 @@ bst <- xgb.train( data = d_train, params = params, maximize = FALSE, - watchlist = list(train = d_train), + evals = list(train = d_train), nrounds = 20) var_imp <- xgb.importance(attr(x, 'Dimnames')[[2]], model = bst) diff --git a/R-package/man/callbacks.Rd b/R-package/man/callbacks.Rd deleted file mode 100644 index 9f6f69015..000000000 --- a/R-package/man/callbacks.Rd +++ /dev/null @@ -1,37 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/callbacks.R -\name{callbacks} -\alias{callbacks} -\title{Callback closures for booster training.} -\description{ -These are used to perform various service tasks either during boosting iterations or at the end. -This approach helps to modularize many of such tasks without bloating the main training methods, -and it offers . -} -\details{ -By default, a callback function is run after each boosting iteration. -An R-attribute \code{is_pre_iteration} could be set for a callback to define a pre-iteration function. - -When a callback function has \code{finalize} parameter, its finalizer part will also be run after -the boosting is completed. - -WARNING: side-effects!!! Be aware that these callback functions access and modify things in -the environment from which they are called from, which is a fairly uncommon thing to do in R. - -To write a custom callback closure, make sure you first understand the main concepts about R environments. -Check either R documentation on \code{\link[base]{environment}} or the -\href{http://adv-r.had.co.nz/Environments.html}{Environments chapter} from the "Advanced R" -book by Hadley Wickham. Further, the best option is to read the code of some of the existing callbacks - -choose ones that do something similar to what you want to achieve. Also, you would need to get familiar -with the objects available inside of the \code{xgb.train} and \code{xgb.cv} internal environments. -} -\seealso{ -\code{\link{cb.print.evaluation}}, -\code{\link{cb.evaluation.log}}, -\code{\link{cb.reset.parameters}}, -\code{\link{cb.early.stop}}, -\code{\link{cb.save.model}}, -\code{\link{cb.cv.predict}}, -\code{\link{xgb.train}}, -\code{\link{xgb.cv}} -} diff --git a/R-package/man/cb.early.stop.Rd b/R-package/man/cb.early.stop.Rd deleted file mode 100644 index 7cd51a3ce..000000000 --- a/R-package/man/cb.early.stop.Rd +++ /dev/null @@ -1,62 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/callbacks.R -\name{cb.early.stop} -\alias{cb.early.stop} -\title{Callback closure to activate the early stopping.} -\usage{ -cb.early.stop( - stopping_rounds, - maximize = FALSE, - metric_name = NULL, - verbose = TRUE -) -} -\arguments{ -\item{stopping_rounds}{The number of rounds with no improvement in -the evaluation metric in order to stop the training.} - -\item{maximize}{whether to maximize the evaluation metric} - -\item{metric_name}{the name of an evaluation column to use as a criteria for early -stopping. If not set, the last column would be used. 
-Let's say the test data in \code{watchlist} was labelled as \code{dtest}, -and one wants to use the AUC in test data for early stopping regardless of where -it is in the \code{watchlist}, then one of the following would need to be set: -\code{metric_name='dtest-auc'} or \code{metric_name='dtest_auc'}. -All dash '-' characters in metric names are considered equivalent to '_'.} - -\item{verbose}{whether to print the early stopping information.} -} -\description{ -Callback closure to activate the early stopping. -} -\details{ -This callback function determines the condition for early stopping -by setting the \code{stop_condition = TRUE} flag in its calling frame. - -The following additional fields are assigned to the model's R object: -\itemize{ -\item \code{best_score} the evaluation score at the best iteration -\item \code{best_iteration} at which boosting iteration the best score has occurred (1-based index) -} -The Same values are also stored as xgb-attributes: -\itemize{ -\item \code{best_iteration} is stored as a 0-based iteration index (for interoperability of binary models) -\item \code{best_msg} message string is also stored. -} - -At least one data element is required in the evaluation watchlist for early stopping to work. - -Callback function expects the following values to be set in its calling frame: -\code{stop_condition}, -\code{bst_evaluation}, -\code{rank}, -\code{bst} (or \code{bst_folds} and \code{basket}), -\code{iteration}, -\code{begin_iteration}, -\code{end_iteration}, -} -\seealso{ -\code{\link{callbacks}}, -\code{\link{xgb.attr}} -} diff --git a/R-package/man/cb.evaluation.log.Rd b/R-package/man/cb.evaluation.log.Rd deleted file mode 100644 index 94f8a02e6..000000000 --- a/R-package/man/cb.evaluation.log.Rd +++ /dev/null @@ -1,31 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/callbacks.R -\name{cb.evaluation.log} -\alias{cb.evaluation.log} -\title{Callback closure for logging the evaluation history} -\usage{ -cb.evaluation.log() -} -\description{ -Callback closure for logging the evaluation history -} -\details{ -This callback function appends the current iteration evaluation results \code{bst_evaluation} -available in the calling parent frame to the \code{evaluation_log} list in a calling frame. - -The finalizer callback (called with \code{finalize = TURE} in the end) converts -the \code{evaluation_log} list into a final data.table. - -The iteration evaluation result \code{bst_evaluation} must be a named numeric vector. - -Note: in the column names of the final data.table, the dash '-' character is replaced with -the underscore '_' in order to make the column names more like regular R identifiers. - -Callback function expects the following values to be set in its calling frame: -\code{evaluation_log}, -\code{bst_evaluation}, -\code{iteration}. 
-} -\seealso{ -\code{\link{callbacks}} -} diff --git a/R-package/man/cb.print.evaluation.Rd b/R-package/man/cb.print.evaluation.Rd deleted file mode 100644 index 59b9ba65e..000000000 --- a/R-package/man/cb.print.evaluation.Rd +++ /dev/null @@ -1,29 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/callbacks.R -\name{cb.print.evaluation} -\alias{cb.print.evaluation} -\title{Callback closure for printing the result of evaluation} -\usage{ -cb.print.evaluation(period = 1, showsd = TRUE) -} -\arguments{ -\item{period}{results would be printed every number of periods} - -\item{showsd}{whether standard deviations should be printed (when available)} -} -\description{ -Callback closure for printing the result of evaluation -} -\details{ -The callback function prints the result of evaluation at every \code{period} iterations. -The initial and the last iteration's evaluations are always printed. - -Callback function expects the following values to be set in its calling frame: -\code{bst_evaluation} (also \code{bst_evaluation_err} when available), -\code{iteration}, -\code{begin_iteration}, -\code{end_iteration}. -} -\seealso{ -\code{\link{callbacks}} -} diff --git a/R-package/man/cb.save.model.Rd b/R-package/man/cb.save.model.Rd deleted file mode 100644 index 7701ad990..000000000 --- a/R-package/man/cb.save.model.Rd +++ /dev/null @@ -1,40 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/callbacks.R -\name{cb.save.model} -\alias{cb.save.model} -\title{Callback closure for saving a model file.} -\usage{ -cb.save.model(save_period = 0, save_name = "xgboost.ubj") -} -\arguments{ -\item{save_period}{save the model to disk after every -\code{save_period} iterations; 0 means save the model at the end.} - -\item{save_name}{the name or path for the saved model file. - -\if{html}{\out{
}}\preformatted{ Note that the format of the model being saved is determined by the file - extension specified here (see \link{xgb.save} for details about how it works). - - It can contain a \code{\link[base]{sprintf}} formatting specifier - to include the integer iteration number in the file name. - E.g., with \code{save_name} = 'xgboost_\%04d.ubj', - the file saved at iteration 50 would be named "xgboost_0050.ubj". -}\if{html}{\out{
}}} -} -\description{ -Callback closure for saving a model file. -} -\details{ -This callback function allows to save an xgb-model file, either periodically after each \code{save_period}'s or at the end. - -Callback function expects the following values to be set in its calling frame: -\code{bst}, -\code{iteration}, -\code{begin_iteration}, -\code{end_iteration}. -} -\seealso{ -\link{xgb.save} - -\code{\link{callbacks}} -} diff --git a/R-package/man/predict.xgb.Booster.Rd b/R-package/man/predict.xgb.Booster.Rd index 95e7a51fd..88a2f203e 100644 --- a/R-package/man/predict.xgb.Booster.Rd +++ b/R-package/man/predict.xgb.Booster.Rd @@ -18,25 +18,47 @@ iterationrange = NULL, strict_shape = FALSE, validate_features = FALSE, + base_margin = NULL, ... ) } \arguments{ \item{object}{Object of class \code{xgb.Booster}.} -\item{newdata}{Takes \code{matrix}, \code{dgCMatrix}, \code{dgRMatrix}, \code{dsparseVector}, +\item{newdata}{Takes \code{data.frame}, \code{matrix}, \code{dgCMatrix}, \code{dgRMatrix}, \code{dsparseVector}, local data file, or \code{xgb.DMatrix}. -For single-row predictions on sparse data, it is recommended to use the CSR format. -If passing a sparse vector, it will take it as a row vector.} -\item{missing}{Only used when input is a dense matrix. Pick a float value that represents -missing values in data (e.g., 0 or some other extreme value).} +\if{html}{\out{
}}\preformatted{ For single-row predictions on sparse data, it's recommended to use CSR format. If passing + a sparse vector, it will take it as a row vector. + + Note that, for repeated predictions on the same data, one might want to create a DMatrix to + pass here instead of passing R types like matrices or data frames, as predictions will be + faster on DMatrix. + + If `newdata` is a `data.frame`, be aware that:\\itemize\{ + \\item Columns will be converted to numeric if they aren't already, which could potentially make + the operation slower than in an equivalent `matrix` object. + \\item The order of the columns must match with that of the data from which the model was fitted + (i.e. columns will not be referenced by their names, just by their order in the data). + \\item If the model was fitted to data with categorical columns, these columns must be of + `factor` type here, and must use the same encoding (i.e. have the same levels). + \\item If `newdata` contains any `factor` columns, they will be converted to base-0 + encoding (same as during DMatrix creation) - hence, one should not pass a `factor` + under a column which during training had a different type. + \} +}\if{html}{\out{
}}} + +\item{missing}{Float value that represents missing values in data (e.g., 0 or some other extreme value). + +\if{html}{\out{
}}\preformatted{ This parameter is not used when `newdata` is an `xgb.DMatrix` - in such cases, should pass + this as an argument to the DMatrix constructor instead. +}\if{html}{\out{
}}} \item{outputmargin}{Whether the prediction should be returned in the form of original untransformed sum of predictions from boosting iterations' results. E.g., setting \code{outputmargin=TRUE} for logistic regression would return log-odds instead of probabilities.} -\item{predleaf}{Whether to predict pre-tree leaf indices.} +\item{predleaf}{Whether to predict per-tree leaf indices.} \item{predcontrib}{Whether to return feature contributions to individual predictions (see Details).} @@ -48,7 +70,7 @@ logistic regression would return log-odds instead of probabilities.} prediction outputs per case. No effect if \code{predleaf}, \code{predcontrib}, or \code{predinteraction} is \code{TRUE}.} -\item{training}{Whether the predictions are used for training. For dart booster, +\item{training}{Whether the prediction result is used for training. For dart booster, training predicting will perform dropout.} \item{iterationrange}{Sequence of rounds/iterations from the model to use for prediction, specified by passing @@ -84,6 +106,13 @@ match (only applicable when both \code{object} and \code{newdata} have feature n recommended to disable it for performance-sensitive applications. }\if{html}{\out{}}} +\item{base_margin}{Base margin used for boosting from existing model. + +\if{html}{\out{
}}\preformatted{ Note that, if `newdata` is an `xgb.DMatrix` object, this argument will + be ignored as it needs to be added to the DMatrix instead (e.g. by passing it as + an argument in its constructor, or by calling \link{setinfo.xgb.DMatrix}). +}\if{html}{\out{
}}} + \item{...}{Not used.} } \value{ @@ -115,7 +144,7 @@ When \code{strict_shape = TRUE}, the output is always an array: } } \description{ -Predicted values based on either xgboost model or model handle object. +Predict values on data based on xgboost model. } \details{ Note that \code{iterationrange} would currently do nothing for predictions from "gblinear", diff --git a/R-package/man/print.xgb.cv.Rd b/R-package/man/print.xgb.cv.Rd index 05ad61eed..74fc15d01 100644 --- a/R-package/man/print.xgb.cv.Rd +++ b/R-package/man/print.xgb.cv.Rd @@ -23,8 +23,8 @@ including the best iteration (when available). \examples{ data(agaricus.train, package='xgboost') train <- agaricus.train -cv <- xgb.cv(data = train$data, label = train$label, nfold = 5, max_depth = 2, - eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") +cv <- xgb.cv(data = xgb.DMatrix(train$data, label = train$label), nfold = 5, max_depth = 2, + eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") print(cv) print(cv, verbose=TRUE) diff --git a/R-package/man/xgb.Callback.Rd b/R-package/man/xgb.Callback.Rd new file mode 100644 index 000000000..b4edcd978 --- /dev/null +++ b/R-package/man/xgb.Callback.Rd @@ -0,0 +1,248 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/callbacks.R +\name{xgb.Callback} +\alias{xgb.Callback} +\title{XGBoost Callback Constructor} +\usage{ +xgb.Callback( + cb_name = "custom_callback", + env = new.env(), + f_before_training = function(env, model, data, evals, begin_iteration, end_iteration) + NULL, + f_before_iter = function(env, model, data, evals, iteration) NULL, + f_after_iter = function(env, model, data, evals, iteration, iter_feval) NULL, + f_after_training = function(env, model, data, evals, iteration, final_feval, + prev_cb_res) NULL +) +} +\arguments{ +\item{cb_name}{Name for the callback. + +If the callback produces some non-NULL result (from executing the function passed under +\code{f_after_training}), that result will be added as an R attribute to the resulting booster +(or as a named element in the result of CV), with the attribute name specified here. + +Names of callbacks must be unique - i.e. there cannot be two callbacks with the same name.} + +\item{env}{An environment object that will be passed to the different functions in the callback. +Note that this environment will not be shared with other callbacks.} + +\item{f_before_training}{A function that will be executed before the training has started. + +If passing \code{NULL} for this or for the other function inputs, then no function will be executed. + +If passing a function, it will be called with parameters supplied as non-named arguments +matching the function signatures that are shown in the default value for each function argument.} + +\item{f_before_iter}{A function that will be executed before each boosting round. + +This function can signal whether the training should be finalized or not, by outputting +a value that evaluates to \code{TRUE} - i.e. if the output from the function provided here at +a given round is \code{TRUE}, then training will be stopped before the current iteration happens. + +Return values of \code{NULL} will be interpreted as \code{FALSE}.} + +\item{f_after_iter}{A function that will be executed after each boosting round. + +This function can signal whether the training should be finalized or not, by outputting +a value that evaluates to \code{TRUE} - i.e. 
if the output from the function provided here at +a given round is \code{TRUE}, then training will be stopped at that round. + +Return values of \code{NULL} will be interpreted as \code{FALSE}.} + +\item{f_after_training}{A function that will be executed after training is finished. + +This function can optionally output something non-NULL, which will become part of the R +attributes of the booster (assuming one passes \code{keep_extra_attributes=TRUE} to \link{xgb.train}) +under the name supplied for parameter \code{cb_name} in the case of \link{xgb.train}; or a part +of the named elements in the result of \link{xgb.cv}.} +} +\value{ +An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}. +} +\description{ +Constructor for defining the structure of callback functions that can be executed +at different stages of model training (before / after training, before / after each boosting +iteration). +} +\details{ +Arguments that will be passed to the supplied functions are as follows:\itemize{ + +\item env The same environment that is passed under argument \code{env}. + +It may be modified by the functions in order to e.g. keep track of what happens +across iterations or similar. + +This environment is only used by the functions supplied to the callback, and will +not be kept after the model fitting function terminates (see parameter \code{f_after_training}). + +\item model The booster object when using \link{xgb.train}, or the folds when using +\link{xgb.cv}. + +For \link{xgb.cv}, folds are a list with a structure as follows:\itemize{ +\item \code{dtrain}: The training data for the fold (as an \code{xgb.DMatrix} object). +\item \code{bst}: The \code{xgb.Booster} object for the fold. +\item \code{evals}: A list containing two DMatrices, with names \code{train} and \code{test} +(\code{test} is the held-out data for the fold). +\item \code{index}: The indices of the hold-out data for that fold (base-1 indexing), +from which the \code{test} entry in \code{evals} was obtained. +} + +This object should \bold{not} be in-place modified in ways that conflict with the +training (e.g. resetting the parameters for a training update in a way that resets +the number of rounds to zero in order to overwrite rounds). + +Note that any R attributes that are assigned to the booster during the callback functions +will not be kept thereafter as the booster object variable is not re-assigned during +training. It is however possible to set C-level attributes of the booster through +\link{xgb.attr} or \link{xgb.attributes}, which should remain available for the rest +of the iterations and after the training is done. + +For keeping variables across iterations, it's recommended to use \code{env} instead. +\item data The data to which the model is being fit, as an \code{xgb.DMatrix} object. + +Note that, for \link{xgb.cv}, this will be the full data, while data for the specific +folds can be found in the \code{model} object. + +\item evals The evaluation data, as passed under argument \code{evals} to +\link{xgb.train}. + +For \link{xgb.cv}, this will always be \code{NULL}. + +\item begin_iteration Index of the first boosting iteration that will be executed +(base-1 indexing). + +This will typically be '1', but when using training continuation, depending on the +parameters for updates, boosting rounds will be continued from where the previous +model ended, in which case this will be larger than 1. 
+ +\item end_iteration Index of the last boosting iteration that will be executed +(base-1 indexing, inclusive of this end). + +It should match with argument \code{nrounds} passed to \link{xgb.train} or \link{xgb.cv}. + +Note that boosting might be interrupted before reaching this last iteration, for +example by using the early stopping callback \link{xgb.cb.early.stop}. + +\item iteration Index of the iteration number that is being executed (first iteration +will be the same as parameter \code{begin_iteration}, then next one will add +1, and so on). + +\item iter_feval Evaluation metrics for \code{evals} that were supplied, either +determined by the objective, or by parameter \code{feval}. + +For \link{xgb.train}, this will be a named vector with one entry per element in +\code{evals}, where the names are determined as 'evals name' + '-' + 'metric name' - for +example, if \code{evals} contains an entry named "tr" and the metric is "rmse", +this will be a one-element vector with name "tr-rmse". + +For \link{xgb.cv}, this will be a 2d matrix with dimensions \verb{[length(evals), nfolds]}, +where the row names will follow the same naming logic as the one-dimensional vector +that is passed in \link{xgb.train}. + +Note that, internally, the built-in callbacks such as \link{xgb.cb.print.evaluation} summarize +this table by calculating the row-wise means and standard deviations. + +\item final_feval The evaluation results after the last boosting round is executed +(same format as \code{iter_feval}, and will be the exact same input as passed under +\code{iter_feval} to the last round that is executed during model fitting). + +\item prev_cb_res Result from a previous run of a callback sharing the same name +(as given by parameter \code{cb_name}) when conducting training continuation, if there +was any in the booster R attributes. + +Sometimes, one might want to append the new results to the previous one, and this will +be done automatically by the built-in callbacks such as \link{xgb.cb.evaluation.log}, +which will append the new rows to the previous table. + +If no such previous callback result is available (which it never will when fitting +a model from start instead of updating an existing model), this will be \code{NULL}. + +For \link{xgb.cv}, which doesn't support training continuation, this will always be \code{NULL}. +} + +The following names (\code{cb_name} values) are reserved for internal callbacks:\itemize{ +\item print_evaluation +\item evaluation_log +\item reset_parameters +\item early_stop +\item save_model +\item cv_predict +\item gblinear_history +} + +The following names are reserved for other non-callback attributes:\itemize{ +\item names +\item class +\item call +\item params +\item niter +\item nfeatures +\item folds +} + +When using the built-in early stopping callback (\link{xgb.cb.early.stop}), said callback +will always be executed before the others, as it sets some booster C-level attributes +that other callbacks might also use. Otherwise, the order of execution will match with +the order in which the callbacks are passed to the model fitting function. +} +\examples{ +# Example constructing a custom callback that calculates +# squared error on the training data (no separate test set), +# and outputs the per-iteration results. 
+ssq_callback <- xgb.Callback( + cb_name = "ssq", + f_before_training = function(env, model, data, evals, + begin_iteration, end_iteration) { + # A vector to keep track of a number at each iteration + env$logs <- rep(NA_real_, end_iteration - begin_iteration + 1) + }, + f_after_iter = function(env, model, data, evals, iteration, iter_feval) { + # This calculates the sum of squared errors on the training data. + # Note that this can be better done by passing an 'evals' entry, + # but this demonstrates a way in which callbacks can be structured. + pred <- predict(model, data) + err <- pred - getinfo(data, "label") + sq_err <- sum(err^2) + env$logs[iteration] <- sq_err + cat( + sprintf( + "Squared error at iteration \%d: \%.2f\n", + iteration, sq_err + ) + ) + + # A return value of 'TRUE' here would signal to finalize the training + return(FALSE) + }, + f_after_training = function(env, model, data, evals, iteration, + final_feval, prev_cb_res) { + return(env$logs) + } +) + +data(mtcars) +y <- mtcars$mpg +x <- as.matrix(mtcars[, -1]) +dm <- xgb.DMatrix(x, label = y, nthread = 1) +model <- xgb.train( + data = dm, + params = list(objective = "reg:squarederror", nthread = 1), + nrounds = 5, + callbacks = list(ssq_callback), + keep_extra_attributes = TRUE +) + +# Result from 'f_after_iter' will be available as an attribute +attributes(model)$ssq +} +\seealso{ +Built-in callbacks:\itemize{ +\item \link{xgb.cb.print.evaluation} +\item \link{xgb.cb.evaluation.log} +\item \link{xgb.cb.reset.parameters} +\item \link{xgb.cb.early.stop} +\item \link{xgb.cb.save.model} +\item \link{xgb.cb.cv.predict} +\item \link{xgb.cb.gblinear.history} +} +} diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd index d18270733..5f764ed45 100644 --- a/R-package/man/xgb.DMatrix.Rd +++ b/R-package/man/xgb.DMatrix.Rd @@ -19,7 +19,8 @@ xgb.DMatrix( qid = NULL, label_lower_bound = NULL, label_upper_bound = NULL, - feature_weights = NULL + feature_weights = NULL, + data_split_mode = "row" ) xgb.QuantileDMatrix( @@ -60,10 +61,27 @@ Other column types are not supported. 'xgb.QuantileDMatrix'. \item Single-row CSR matrices, as class \code{dsparseVector} from package \code{Matrix}, which is interpreted as a single row (only when making predictions from a fitted model). -\item Text files in SVMLight / LibSVM formats, passed as a path to the file. These are \bold{not} -supported for xgb.QuantileDMatrix'. -\item Binary files generated by \link{xgb.DMatrix.save}, passed as a path to the file. These are -\bold{not} supported for xgb.QuantileDMatrix'. +\item Text files in a supported format, passed as a \code{character} variable containing the URI path to +the file, with an optional format specifier. + +These are \bold{not} supported for \code{xgb.QuantileDMatrix}. Supported formats are:\itemize{ +\item XGBoost's own binary format for DMatrices, as produced by \link{xgb.DMatrix.save}. +\item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix +\code{?format=libsvm} at the end of the file path. It will be the default format if not +otherwise specified. +\item CSV files (comma-separated values). This format can be specified by adding suffix +\code{?format=csv} at the end of the file path. It will \bold{not} be auto-deduced from file extensions. +} + +Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv', +it will not look at the extension or file contents to determine that it is a comma-separated values file. 
+Instead, the format must be specified following the URI format, so the input to \code{data} should be passed +like this: \code{"file.csv?format=csv"} (or \code{"file.csv?format=csv&label_column=0"} if the first column +corresponds to the labels). + +For more information about passing text files as input, see the articles +\href{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}{Text Input Format of DMatrix} and +\href{https://xgboost.readthedocs.io/en/stable/python/python_intro.html#python-data-interface}{Data Interface}. }} \item{label}{Label of the training data. For classification problems, should be passed encoded as @@ -129,6 +147,14 @@ not be saved, so make sure that \code{factor} columns passed to \code{predict} h \item{feature_weights}{Set feature weights for column sampling.} +\item{data_split_mode}{When passing a URI (as R \code{character}) as input, this signals +whether to split by row or column. Allowed values are \code{"row"} and \code{"col"}. + +In distributed mode, the file is split accordingly; otherwise this is only an indicator of +how the file was split beforehand. Defaults to row. + +This is not used when \code{data} is not a URI.} + +\item{ref}{The training dataset that provides quantile information, needed when creating validation/test dataset with \code{xgb.QuantileDMatrix}. Supplying the training DMatrix as a reference means that the same quantisation applied to the training data is diff --git a/R-package/man/xgb.DMatrix.save.Rd b/R-package/man/xgb.DMatrix.save.Rd index d5c0563b3..51643274d 100644 --- a/R-package/man/xgb.DMatrix.save.Rd +++ b/R-package/man/xgb.DMatrix.save.Rd @@ -15,6 +15,7 @@ xgb.DMatrix.save(dmatrix, fname) Save xgb.DMatrix object to binary file } \examples{ +\dontshow{RhpcBLASctl::omp_set_num_threads(1)} data(agaricus.train, package='xgboost') dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label, nthread = 2)) fname <- file.path(tempdir(), "xgb.DMatrix.data") diff --git a/R-package/man/cb.cv.predict.Rd b/R-package/man/xgb.cb.cv.predict.Rd similarity index 53% rename from R-package/man/cb.cv.predict.Rd rename to R-package/man/xgb.cb.cv.predict.Rd index 4cabac1c9..d2d9a084b 100644 --- a/R-package/man/cb.cv.predict.Rd +++ b/R-package/man/xgb.cb.cv.predict.Rd @@ -1,16 +1,27 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/callbacks.R -\name{cb.cv.predict} -\alias{cb.cv.predict} -\title{Callback closure for returning cross-validation based predictions.} +\name{xgb.cb.cv.predict} +\alias{xgb.cb.cv.predict} +\title{Callback for returning cross-validation based predictions.} \usage{ -cb.cv.predict(save_models = FALSE) +xgb.cb.cv.predict(save_models = FALSE, outputmargin = FALSE) } \arguments{ -\item{save_models}{a flag for whether to save the folds' models.} +\item{save_models}{A flag for whether to save the folds' models.} + +\item{outputmargin}{Whether to save margin predictions (same effect as passing this +parameter to \link{predict.xgb.Booster}).} } \value{ -Predictions are returned inside of the \code{pred} element, which is either a vector or a matrix, +An \code{xgb.Callback} object, which can be passed to \link{xgb.cv}, +but \bold{not} to \link{xgb.train}. +} +\description{ +This callback function saves predictions for all of the test folds, +and also allows saving the folds' models. +} +\details{ +Predictions are saved inside of the \code{pred} element, which is either a vector or a matrix, depending on the number of prediction outputs per data row. 
The order of predictions corresponds to the order of rows in the original dataset. Note that when a custom \code{folds} list is provided in \code{xgb.cv}, the predictions would only be returned properly when this list is a @@ -19,23 +30,3 @@ meaningful when user-provided folds have overlapping indices as in, e.g., random When some of the indices in the training dataset are not included into user-provided \code{folds}, their prediction value would be \code{NA}. } -\description{ -Callback closure for returning cross-validation based predictions. -} -\details{ -This callback function saves predictions for all of the test folds, -and also allows to save the folds' models. - -It is a "finalizer" callback and it uses early stopping information whenever it is available, -thus it must be run after the early stopping callback if the early stopping is used. - -Callback function expects the following values to be set in its calling frame: -\code{bst_folds}, -\code{basket}, -\code{data}, -\code{end_iteration}, -\code{params}, -} -\seealso{ -\code{\link{callbacks}} -} diff --git a/R-package/man/xgb.cb.early.stop.Rd b/R-package/man/xgb.cb.early.stop.Rd new file mode 100644 index 000000000..2a70f4943 --- /dev/null +++ b/R-package/man/xgb.cb.early.stop.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/callbacks.R +\name{xgb.cb.early.stop} +\alias{xgb.cb.early.stop} +\title{Callback to activate early stopping} +\usage{ +xgb.cb.early.stop( + stopping_rounds, + maximize = FALSE, + metric_name = NULL, + verbose = TRUE, + keep_all_iter = TRUE +) +} +\arguments{ +\item{stopping_rounds}{The number of rounds with no improvement in +the evaluation metric in order to stop the training.} + +\item{maximize}{Whether to maximize the evaluation metric.} + +\item{metric_name}{The name of an evaluation column to use as a criteria for early +stopping. If not set, the last column would be used. +Let's say the test data in \code{evals} was labelled as \code{dtest}, +and one wants to use the AUC in test data for early stopping regardless of where +it is in the \code{evals}, then one of the following would need to be set: +\code{metric_name='dtest-auc'} or \code{metric_name='dtest_auc'}. +All dash '-' characters in metric names are considered equivalent to '_'.} + +\item{verbose}{Whether to print the early stopping information.} + +\item{keep_all_iter}{Whether to keep all of the boosting rounds that were produced +in the resulting object. If passing \code{FALSE}, will only keep the boosting rounds +up to the detected best iteration, discarding the ones that come after.} +} +\value{ +An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}. +} +\description{ +This callback function determines the condition for early stopping. + +The following attributes are assigned to the booster's object: +\itemize{ +\item \code{best_score} the evaluation score at the best iteration +\item \code{best_iteration} at which boosting iteration the best score has occurred +(0-based index for interoperability of binary models) +} + +The same values are also stored as R attributes as a result of the callback, plus an additional +attribute \code{stopped_by_max_rounds} which indicates whether an early stopping by the \code{stopping_rounds} +condition occurred. 
Note that the \code{best_iteration} that is stored under R attributes will follow +base-1 indexing, so it will be larger by '1' than the C-level 'best_iteration' that is accessed +through \link{xgb.attr} or \link{xgb.attributes}. + +At least one dataset is required in \code{evals} for early stopping to work. +} diff --git a/R-package/man/xgb.cb.evaluation.log.Rd b/R-package/man/xgb.cb.evaluation.log.Rd new file mode 100644 index 000000000..4cc6ef636 --- /dev/null +++ b/R-package/man/xgb.cb.evaluation.log.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/callbacks.R +\name{xgb.cb.evaluation.log} +\alias{xgb.cb.evaluation.log} +\title{Callback for logging the evaluation history} +\usage{ +xgb.cb.evaluation.log() +} +\value{ +An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}. +} +\description{ +Callback for logging the evaluation history +} +\details{ +This callback creates a table with per-iteration evaluation metrics (see parameters +\code{evals} and \code{feval} in \link{xgb.train}). + +Note: in the column names of the final data.table, the dash '-' character is replaced with +the underscore '_' in order to make the column names more like regular R identifiers. +} +\seealso{ +\link{xgb.cb.print.evaluation} +} diff --git a/R-package/man/cb.gblinear.history.Rd b/R-package/man/xgb.cb.gblinear.history.Rd similarity index 63% rename from R-package/man/cb.gblinear.history.Rd rename to R-package/man/xgb.cb.gblinear.history.Rd index 2a03c14db..0ebaa4685 100644 --- a/R-package/man/cb.gblinear.history.Rd +++ b/R-package/man/xgb.cb.gblinear.history.Rd @@ -1,37 +1,48 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/callbacks.R -\name{cb.gblinear.history} -\alias{cb.gblinear.history} -\title{Callback closure for collecting the model coefficients history of a gblinear booster -during its training.} +\name{xgb.cb.gblinear.history} +\alias{xgb.cb.gblinear.history} +\title{Callback for collecting coefficients history of a gblinear booster} \usage{ -cb.gblinear.history(sparse = FALSE) +xgb.cb.gblinear.history(sparse = FALSE) } \arguments{ -\item{sparse}{when set to FALSE/TRUE, a dense/sparse matrix is used to store the result. +\item{sparse}{when set to \code{FALSE}/\code{TRUE}, a dense/sparse matrix is used to store the result. Sparse format is useful when one expects only a subset of coefficients to be non-zero, when using the "thrifty" feature selector with fairly small number of top features selected per iteration.} } \value{ -Results are stored in the \code{coefs} element of the closure. -The \code{\link{xgb.gblinear.history}} convenience function provides an easy -way to access it. -With \code{xgb.train}, it is either a dense of a sparse matrix. -While with \code{xgb.cv}, it is a list (an element per each fold) of such -matrices. +An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}. } \description{ -Callback closure for collecting the model coefficients history of a gblinear booster -during its training. +Callback for collecting coefficients history of a gblinear booster } \details{ To keep things fast and simple, gblinear booster does not internally store the history of linear model coefficients at each boosting iteration. This callback provides a workaround for storing the coefficients' path, by extracting them after each training iteration. 
-Callback function expects the following values to be set in its calling frame: -\code{bst} (or \code{bst_folds}). +This callback will construct a matrix where rows are boosting iterations and columns are +feature coefficients (same order as when calling \link{coef.xgb.Booster}, with the intercept +corresponding to the first column). + +When there is more than one coefficient per feature (e.g. multi-class classification), +the result will be reshaped into a vector where coefficients are arranged first by features and +then by class (e.g. first 1 through N coefficients will be for the first class, then +coefficients N+1 through 2N for the second class, and so on). + +If the result has only one coefficient per feature in the data, then the resulting matrix +will have column names matching with the feature names, otherwise (when there's more than +one coefficient per feature) the names will be composed as 'column name' + ':' + 'class index' +(so e.g. column 'c1' for class '0' will be named 'c1:0'). + +With \code{xgb.train}, the output is either a dense or a sparse matrix. +With \code{xgb.cv}, it is a list (one element per fold) of such +matrices. + +The \link{xgb.gblinear.history} function provides an easy way to retrieve the +outputs from this callback. } \examples{ #### Binary classification: @@ -52,7 +63,7 @@ param <- list(booster = "gblinear", objective = "reg:logistic", eval_metric = "a # rate does not break the convergence, but allows us to illustrate the typical pattern of # "stochastic explosion" behaviour of this lock-free algorithm at early boosting iterations. bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 1., - callbacks = list(cb.gblinear.history())) + callbacks = list(xgb.cb.gblinear.history())) # Extract the coefficients' path and plot them vs boosting iteration number: coef_path <- xgb.gblinear.history(bst) matplot(coef_path, type = 'l') @@ -61,7 +72,7 @@ matplot(coef_path, type = 'l') # Will try the classical componentwise boosting which selects a single best feature per round: bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 200, eta = 0.8, updater = 'coord_descent', feature_selector = 'thrifty', top_k = 1, - callbacks = list(cb.gblinear.history())) + callbacks = list(xgb.cb.gblinear.history())) matplot(xgb.gblinear.history(bst), type = 'l') # Componentwise boosting is known to have similar effect to Lasso regularization. 
# Try experimenting with various values of top_k, eta, nrounds, @@ -69,7 +80,7 @@ matplot(xgb.gblinear.history(bst), type = 'l') # For xgb.cv: bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 100, eta = 0.8, - callbacks = list(cb.gblinear.history())) + callbacks = list(xgb.cb.gblinear.history())) # coefficients in the CV fold #3 matplot(xgb.gblinear.history(bst)[[3]], type = 'l') @@ -82,7 +93,7 @@ param <- list(booster = "gblinear", objective = "multi:softprob", num_class = 3, # For the default linear updater 'shotgun' it sometimes is helpful # to use smaller eta to reduce instability bst <- xgb.train(param, dtrain, list(tr=dtrain), nrounds = 50, eta = 0.5, - callbacks = list(cb.gblinear.history())) + callbacks = list(xgb.cb.gblinear.history())) # Will plot the coefficient paths separately for each class: matplot(xgb.gblinear.history(bst, class_index = 0), type = 'l') matplot(xgb.gblinear.history(bst, class_index = 1), type = 'l') @@ -90,11 +101,11 @@ matplot(xgb.gblinear.history(bst, class_index = 2), type = 'l') # CV: bst <- xgb.cv(param, dtrain, nfold = 5, nrounds = 70, eta = 0.5, - callbacks = list(cb.gblinear.history(FALSE))) + callbacks = list(xgb.cb.gblinear.history(FALSE))) # 1st fold of 1st class matplot(xgb.gblinear.history(bst, class_index = 0)[[1]], type = 'l') } \seealso{ -\code{\link{callbacks}}, \code{\link{xgb.gblinear.history}}. +\link{xgb.gblinear.history}, \link{coef.xgb.Booster}. } diff --git a/R-package/man/xgb.cb.print.evaluation.Rd b/R-package/man/xgb.cb.print.evaluation.Rd new file mode 100644 index 000000000..c4f2e6991 --- /dev/null +++ b/R-package/man/xgb.cb.print.evaluation.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/callbacks.R +\name{xgb.cb.print.evaluation} +\alias{xgb.cb.print.evaluation} +\title{Callback for printing the result of evaluation} +\usage{ +xgb.cb.print.evaluation(period = 1, showsd = TRUE) +} +\arguments{ +\item{period}{results would be printed every number of periods} + +\item{showsd}{whether standard deviations should be printed (when available)} +} +\value{ +An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}. +} +\description{ +The callback function prints the result of evaluation at every \code{period} iterations. +The initial and the last iteration's evaluations are always printed. + +Does not leave any attribute in the booster (see \link{xgb.cb.evaluation.log} for that). +} +\seealso{ +\link{xgb.Callback} +} diff --git a/R-package/man/cb.reset.parameters.Rd b/R-package/man/xgb.cb.reset.parameters.Rd similarity index 57% rename from R-package/man/cb.reset.parameters.Rd rename to R-package/man/xgb.cb.reset.parameters.Rd index ee0a5d1bd..c7e863817 100644 --- a/R-package/man/cb.reset.parameters.Rd +++ b/R-package/man/xgb.cb.reset.parameters.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/callbacks.R -\name{cb.reset.parameters} -\alias{cb.reset.parameters} -\title{Callback closure for resetting the booster's parameters at each iteration.} +\name{xgb.cb.reset.parameters} +\alias{xgb.cb.reset.parameters} +\title{Callback for resetting the booster's parameters at each iteration.} \usage{ -cb.reset.parameters(new_params) +xgb.cb.reset.parameters(new_params) } \arguments{ \item{new_params}{a list where each element corresponds to a parameter that needs to be reset. 
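For the renamed xgb.cb.reset.parameters() callback above, a minimal usage sketch follows. It assumes the agaricus data and a `dtrain` DMatrix as used elsewhere in these docs; the decaying-eta schedule is purely illustrative and relies only on the documented behaviour that each element of `new_params` may be a vector of per-round values or a function of `(iteration, nrounds)`.

library(xgboost)
data(agaricus.train, package = "xgboost")
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label, nthread = 2)
param <- list(max_depth = 2, objective = "binary:logistic", nthread = 2)
# Illustrative schedule: start at eta = 0.5 and shrink it by 10% each round.
eta_schedule <- function(iteration, nrounds) 0.5 * 0.9^(iteration - 1)
bst <- xgb.train(
  param, dtrain, nrounds = 10, verbose = 0,
  callbacks = list(xgb.cb.reset.parameters(list(eta = eta_schedule)))
)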
@@ -14,23 +14,16 @@ or a function of two parameters \code{learning_rates(iteration, nrounds)} which returns a new parameter value by using the current iteration number and the total number of boosting rounds.} } +\value{ +An \code{xgb.Callback} object, which can be passed to \link{xgb.train} or \link{xgb.cv}. +} \description{ -Callback closure for resetting the booster's parameters at each iteration. +Callback for resetting the booster's parameters at each iteration. } \details{ -This is a "pre-iteration" callback function used to reset booster's parameters -at the beginning of each iteration. - Note that when training is resumed from some previous model, and a function is used to reset a parameter value, the \code{nrounds} argument in this function would be the the number of boosting rounds in the current training. -Callback function expects the following values to be set in its calling frame: -\code{bst} or \code{bst_folds}, -\code{iteration}, -\code{begin_iteration}, -\code{end_iteration}. -} -\seealso{ -\code{\link{callbacks}} +Does not leave any attribute in the booster. } diff --git a/R-package/man/xgb.cb.save.model.Rd b/R-package/man/xgb.cb.save.model.Rd new file mode 100644 index 000000000..8ddba2f1a --- /dev/null +++ b/R-package/man/xgb.cb.save.model.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/callbacks.R +\name{xgb.cb.save.model} +\alias{xgb.cb.save.model} +\title{Callback for saving a model file.} +\usage{ +xgb.cb.save.model(save_period = 0, save_name = "xgboost.ubj") +} +\arguments{ +\item{save_period}{Save the model to disk after every +\code{save_period} iterations; 0 means save the model at the end.} + +\item{save_name}{The name or path for the saved model file. +It can contain a \code{\link[base]{sprintf}} formatting specifier +to include the integer iteration number in the file name. +E.g., with \code{save_name} = 'xgboost_\%04d.model', +the file saved at iteration 50 would be named "xgboost_0050.model".} +} +\value{ +An \code{xgb.Callback} object, which can be passed to \link{xgb.train}, +but \bold{not} to \link{xgb.cv}. +} +\description{ +This callback function allows to save an xgb-model file, either periodically +after each \code{save_period}'s or at the end. + +Does not leave any attribute in the booster. +} diff --git a/R-package/man/xgb.create.features.Rd b/R-package/man/xgb.create.features.Rd index 68b561997..995c27459 100644 --- a/R-package/man/xgb.create.features.Rd +++ b/R-package/man/xgb.create.features.Rd @@ -82,7 +82,6 @@ new.dtrain <- xgb.DMatrix( new.dtest <- xgb.DMatrix( data = new.features.test, label = agaricus.test$label, nthread = 2 ) -watchlist <- list(train = new.dtrain) bst <- xgb.train(params = param, data = new.dtrain, nrounds = nrounds, nthread = 2) # Model accuracy with new features diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index 9f6103a52..cede67570 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -9,14 +9,12 @@ xgb.cv( data, nrounds, nfold, - label = NULL, - missing = NA, prediction = FALSE, showsd = TRUE, metrics = list(), obj = NULL, feval = NULL, - stratified = TRUE, + stratified = "auto", folds = NULL, train_folds = NULL, verbose = TRUE, @@ -44,22 +42,25 @@ is a shorter summary: } See \code{\link{xgb.train}} for further details. -See also demo/ for walkthrough example in R.} +See also demo/ for walkthrough example in R. 
-\item{data}{takes an \code{xgb.DMatrix}, \code{matrix}, or \code{dgCMatrix} as the input.} +Note that, while \code{params} accepts a \code{seed} entry and will use such parameter for model training if +supplied, this seed is not used for creation of train-test splits, which instead rely on R's own RNG +system - thus, for reproducible results, one needs to call the \code{set.seed} function beforehand.} + +\item{data}{An \code{xgb.DMatrix} object, with corresponding fields like \code{label} or bounds as required +for model training by the objective. + +\if{html}{\out{
}}\preformatted{ Note that only the basic `xgb.DMatrix` class is supported - variants such as `xgb.QuantileDMatrix` + or `xgb.ExternalDMatrix` are not supported here. +}\if{html}{\out{
}}} \item{nrounds}{the max number of iterations} \item{nfold}{the original dataset is randomly partitioned into \code{nfold} equal size subsamples.} -\item{label}{vector of response values. Should be provided only when data is an R-matrix.} - -\item{missing}{is only used when input is a dense matrix. By default is set to NA, which means -that NA values should be considered as 'missing' by the algorithm. -Sometimes, 0 or other extreme value might be used to represent missing values.} - \item{prediction}{A logical value indicating whether to return the test fold predictions -from each CV model. This parameter engages the \code{\link{cb.cv.predict}} callback.} +from each CV model. This parameter engages the \code{\link{xgb.cb.cv.predict}} callback.} \item{showsd}{\code{boolean}, whether to show standard deviation of cross validation} @@ -84,34 +85,54 @@ gradient with given prediction and dtrain.} \code{list(metric='metric-name', value='metric-value')} with given prediction and dtrain.} -\item{stratified}{a \code{boolean} indicating whether sampling of folds should be stratified -by the values of outcome labels.} +\item{stratified}{A \code{boolean} indicating whether sampling of folds should be stratified +by the values of outcome labels. For real-valued labels in regression objectives, +stratification will be done by discretizing the labels into up to 5 buckets beforehand. + +\if{html}{\out{
}}\preformatted{ If passing "auto", will be set to `TRUE` if the objective in `params` is a classification + objective (from XGBoost's built-in objectives, doesn't apply to custom ones), and to + `FALSE` otherwise. + + This parameter is ignored when `data` has a `group` field - in such case, the splitting + will be based on whole groups (note that this might make the folds have different sizes). + + Value `TRUE` here is \\bold\{not\} supported for custom objectives. +}\if{html}{\out{
}}} \item{folds}{\code{list} provides a possibility to use a list of pre-defined CV folds (each element must be a vector of test fold's indices). When folds are supplied, -the \code{nfold} and \code{stratified} parameters are ignored.} +the \code{nfold} and \code{stratified} parameters are ignored. + +\if{html}{\out{
}}\preformatted{ If `data` has a `group` field and the objective requires this field, each fold (list element) + must additionally have two attributes (retrievable through \link{attributes}) named `group_test` + and `group_train`, which should hold the `group` to assign through \link{setinfo.xgb.DMatrix} to + the resulting DMatrices. +}\if{html}{\out{
}}} \item{train_folds}{\code{list} list specifying which indicies to use for training. If \code{NULL} -(the default) all indices not specified in \code{folds} will be used for training.} +(the default) all indices not specified in \code{folds} will be used for training. + +\if{html}{\out{
}}\preformatted{ This is not supported when `data` has `group` field. +}\if{html}{\out{
}}} \item{verbose}{\code{boolean}, print the statistics during the process} \item{print_every_n}{Print each n-th iteration evaluation messages when \code{verbose>0}. Default is 1 which means all messages are printed. This parameter is passed to the -\code{\link{cb.print.evaluation}} callback.} +\code{\link{xgb.cb.print.evaluation}} callback.} \item{early_stopping_rounds}{If \code{NULL}, the early stopping function is not triggered. If set to an integer \code{k}, training with a validation set will stop if the performance doesn't improve for \code{k} rounds. -Setting this parameter engages the \code{\link{cb.early.stop}} callback.} +Setting this parameter engages the \code{\link{xgb.cb.early.stop}} callback.} \item{maximize}{If \code{feval} and \code{early_stopping_rounds} are set, then this parameter must be set as well. When it is \code{TRUE}, it means the larger the evaluation score the better. -This parameter is passed to the \code{\link{cb.early.stop}} callback.} +This parameter is passed to the \code{\link{xgb.cb.early.stop}} callback.} \item{callbacks}{a list of callback functions to perform various task during boosting. -See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the +See \code{\link{xgb.Callback}}. Some of the callbacks are automatically created depending on the parameters' values. User can provide either existing or their own callback methods in order to customize the training process.} @@ -122,27 +143,27 @@ An object of class \code{xgb.cv.synchronous} with the following elements: \itemize{ \item \code{call} a function call. \item \code{params} parameters that were passed to the xgboost library. Note that it does not -capture parameters changed by the \code{\link{cb.reset.parameters}} callback. -\item \code{callbacks} callback functions that were either automatically assigned or -explicitly passed. +capture parameters changed by the \code{\link{xgb.cb.reset.parameters}} callback. \item \code{evaluation_log} evaluation history stored as a \code{data.table} with the first column corresponding to iteration number and the rest corresponding to the CV-based evaluation means and standard deviations for the training and test CV-sets. -It is created by the \code{\link{cb.evaluation.log}} callback. +It is created by the \code{\link{xgb.cb.evaluation.log}} callback. \item \code{niter} number of boosting iterations. \item \code{nfeatures} number of features in training data. \item \code{folds} the list of CV folds' indices - either those passed through the \code{folds} parameter or randomly generated. \item \code{best_iteration} iteration number with the best evaluation metric value (only available with early stopping). -\item \code{pred} CV prediction values available when \code{prediction} is set. -It is either vector or matrix (see \code{\link{cb.cv.predict}}). -\item \code{models} a list of the CV folds' models. It is only available with the explicit -setting of the \code{cb.cv.predict(save_models = TRUE)} callback. 
} + +Plus other potential elements that are the result of callbacks, such as a list \code{cv_predict} with +a sub-element \code{pred} when passing \code{prediction = TRUE}, which is added by the \link{xgb.cb.cv.predict} +callback (note that one can also pass it manually under \code{callbacks} with different settings, +such as saving also the models created during cross validation); or a list \code{early_stop} which +will contain elements such as \code{best_iteration} when using the early stopping callback (\link{xgb.cb.early.stop}). } \description{ -The cross validation function of xgboost +The cross validation function of xgboost. } \details{ The original sample is randomly partitioned into \code{nfold} equal size subsamples. diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd index 2cdb6b16a..6f97f6924 100644 --- a/R-package/man/xgb.dump.Rd +++ b/R-package/man/xgb.dump.Rd @@ -44,6 +44,7 @@ as a \code{character} vector. Otherwise it will return \code{TRUE}. Dump an xgboost model in text format. } \examples{ +\dontshow{RhpcBLASctl::omp_set_num_threads(1)} data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') train <- agaricus.train diff --git a/R-package/man/xgb.gblinear.history.Rd b/R-package/man/xgb.gblinear.history.Rd index 103be16f1..25aef7163 100644 --- a/R-package/man/xgb.gblinear.history.Rd +++ b/R-package/man/xgb.gblinear.history.Rd @@ -8,7 +8,7 @@ xgb.gblinear.history(model, class_index = NULL) } \arguments{ \item{model}{either an \code{xgb.Booster} or a result of \code{xgb.cv()}, trained -using the \code{cb.gblinear.history()} callback, but \bold{not} a booster +using the \link{xgb.cb.gblinear.history} callback, but \bold{not} a booster loaded from \link{xgb.load} or \link{xgb.load.raw}.} \item{class_index}{zero-based class index to extract the coefficients for only that @@ -16,23 +16,31 @@ specific class in a multinomial multiclass model. When it is NULL, all the coefficients are returned. Has no effect in non-multiclass models.} } \value{ -For an \code{xgb.train} result, a matrix (either dense or sparse) with the columns -corresponding to iteration's coefficients (in the order as \code{xgb.dump()} would -return) and the rows corresponding to boosting iterations. +For an \link{xgb.train} result, a matrix (either dense or sparse) with the columns +corresponding to iteration's coefficients and the rows corresponding to boosting iterations. -For an \code{xgb.cv} result, a list of such matrices is returned with the elements +For an \link{xgb.cv} result, a list of such matrices is returned with the elements corresponding to CV folds. + +When there is more than one coefficient per feature (e.g. multi-class classification) +and \code{class_index} is not provided, +the result will be reshaped into a vector where coefficients are arranged first by features and +then by class (e.g. first 1 through N coefficients will be for the first class, then +coefficients N+1 through 2N for the second class, and so on). } \description{ A helper function to extract the matrix of linear coefficients' history -from a gblinear model created while using the \code{cb.gblinear.history()} -callback. +from a gblinear model created while using the \link{xgb.cb.gblinear.history} +callback (which must be added manually as by default it's not used). } \details{ Note that this is an R-specific function that relies on R attributes that are not saved when using xgboost's own serialization functions like \link{xgb.load} or \link{xgb.load.raw}. 
-In order for a serialized model to be accepted by tgis function, one must use R +In order for a serialized model to be accepted by this function, one must use R serializers such as \link{saveRDS}. } +\seealso{ +\link{xgb.cb.gblinear.history}, \link{coef.xgb.Booster}. +} diff --git a/R-package/man/xgb.load.Rd b/R-package/man/xgb.load.Rd index 1a6873171..e18a900e3 100644 --- a/R-package/man/xgb.load.Rd +++ b/R-package/man/xgb.load.Rd @@ -17,7 +17,7 @@ Load xgboost model from the binary model file. } \details{ The input file is expected to contain a model saved in an xgboost model format -using either \code{\link{xgb.save}} or \code{\link{cb.save.model}} in R, or using some +using either \code{\link{xgb.save}} or \code{\link{xgb.cb.save.model}} in R, or using some appropriate methods from other xgboost interfaces. E.g., a model trained in Python and saved from there in xgboost format, could be loaded from R. @@ -25,6 +25,7 @@ Note: a model saved as an R-object, has to be loaded using corresponding R-metho not \code{xgb.load}. } \examples{ +\dontshow{RhpcBLASctl::omp_set_num_threads(1)} data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') diff --git a/R-package/man/xgb.save.Rd b/R-package/man/xgb.save.Rd index 0db80a120..bcfbd0bb4 100644 --- a/R-package/man/xgb.save.Rd +++ b/R-package/man/xgb.save.Rd @@ -41,6 +41,7 @@ how to persist models in a future-proof way, i.e. to make the model accessible i releases of XGBoost. } \examples{ +\dontshow{RhpcBLASctl::omp_set_num_threads(1)} data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') diff --git a/R-package/man/xgb.save.raw.Rd b/R-package/man/xgb.save.raw.Rd index 15400bb14..6cdafd3d9 100644 --- a/R-package/man/xgb.save.raw.Rd +++ b/R-package/man/xgb.save.raw.Rd @@ -21,6 +21,7 @@ xgb.save.raw(model, raw_format = "ubj") Save xgboost model from xgboost or xgb.train } \examples{ +\dontshow{RhpcBLASctl::omp_set_num_threads(1)} data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') diff --git a/R-package/man/xgb.slice.DMatrix.Rd b/R-package/man/xgb.slice.DMatrix.Rd index c9695996b..c4f776594 100644 --- a/R-package/man/xgb.slice.DMatrix.Rd +++ b/R-package/man/xgb.slice.DMatrix.Rd @@ -6,14 +6,18 @@ \title{Get a new DMatrix containing the specified rows of original xgb.DMatrix object} \usage{ -xgb.slice.DMatrix(object, idxset) +xgb.slice.DMatrix(object, idxset, allow_groups = FALSE) \method{[}{xgb.DMatrix}(object, idxset, colset = NULL) } \arguments{ -\item{object}{Object of class "xgb.DMatrix"} +\item{object}{Object of class "xgb.DMatrix".} -\item{idxset}{a integer vector of indices of rows needed} +\item{idxset}{An integer vector of indices of rows needed (base-1 indexing).} + +\item{allow_groups}{Whether to allow slicing an \code{xgb.DMatrix} with \code{group} (or +equivalently \code{qid}) field. 
Note that in such case, the result will not have +the groups anymore - they need to be set manually through \code{setinfo}.} \item{colset}{currently not used (columns subsetting is not available)} } diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index 0421b9c4a..21c8dbe16 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -9,7 +9,7 @@ xgb.train( params = list(), data, nrounds, - watchlist = list(), + evals = list(), obj = NULL, feval = NULL, verbose = 1, @@ -158,13 +158,13 @@ List is provided in detail section.} \item{nrounds}{max number of boosting iterations.} -\item{watchlist}{named list of xgb.DMatrix datasets to use for evaluating model performance. +\item{evals}{Named list of \code{xgb.DMatrix} datasets to use for evaluating model performance. Metrics specified in either \code{eval_metric} or \code{feval} will be computed for each of these datasets during each boosting iteration, and stored in the end as a field named \code{evaluation_log} in the resulting object. When either \code{verbose>=1} or -\code{\link{cb.print.evaluation}} callback is engaged, the performance results are continuously +\code{\link{xgb.cb.print.evaluation}} callback is engaged, the performance results are continuously printed out during the training. -E.g., specifying \code{watchlist=list(validation1=mat1, validation2=mat2)} allows to track +E.g., specifying \code{evals=list(validation1=mat1, validation2=mat2)} allows to track the performance of each round's model on mat1 and mat2.} \item{obj}{customized objective function. Returns gradient and second order @@ -177,24 +177,24 @@ prediction and dtrain.} \item{verbose}{If 0, xgboost will stay silent. If 1, it will print information about performance. If 2, some additional information will be printed out. Note that setting \code{verbose > 0} automatically engages the -\code{cb.print.evaluation(period=1)} callback function.} +\code{xgb.cb.print.evaluation(period=1)} callback function.} \item{print_every_n}{Print each n-th iteration evaluation messages when \code{verbose>0}. Default is 1 which means all messages are printed. This parameter is passed to the -\code{\link{cb.print.evaluation}} callback.} +\code{\link{xgb.cb.print.evaluation}} callback.} \item{early_stopping_rounds}{If \code{NULL}, the early stopping function is not triggered. If set to an integer \code{k}, training with a validation set will stop if the performance doesn't improve for \code{k} rounds. -Setting this parameter engages the \code{\link{cb.early.stop}} callback.} +Setting this parameter engages the \code{\link{xgb.cb.early.stop}} callback.} \item{maximize}{If \code{feval} and \code{early_stopping_rounds} are set, then this parameter must be set as well. When it is \code{TRUE}, it means the larger the evaluation score the better. -This parameter is passed to the \code{\link{cb.early.stop}} callback.} +This parameter is passed to the \code{\link{xgb.cb.early.stop}} callback.} \item{save_period}{when it is non-NULL, model is saved to disk after every \code{save_period} rounds, -0 means save at the end. The saving is handled by the \code{\link{cb.save.model}} callback.} +0 means save at the end. 
The saving is handled by the \code{\link{xgb.cb.save.model}} callback.} \item{save_name}{the name or path for periodically saved model file.} @@ -203,12 +203,13 @@ Could be either an object of class \code{xgb.Booster}, or its raw data, or the n file with a previously saved model.} \item{callbacks}{a list of callback functions to perform various task during boosting. -See \code{\link{callbacks}}. Some of the callbacks are automatically created depending on the +See \code{\link{xgb.Callback}}. Some of the callbacks are automatically created depending on the parameters' values. User can provide either existing or their own callback methods in order to customize the training process. -\if{html}{\out{
}}\preformatted{ Note that some callbacks might try to set an evaluation log - be aware that these evaluation logs - are kept as R attributes, and thus do not get saved when using non-R serializaters like +\if{html}{\out{
}}\preformatted{ Note that some callbacks might try to leave attributes in the resulting model object, + such as an evaluation log (a `data.table` object) - be aware that these objects are kept + as R attributes, and thus do not get saved when using XGBoost's own serializers like \link{xgb.save} (but are kept when using R serializers like \link{saveRDS}). }\if{html}{\out{
}}} @@ -233,7 +234,7 @@ The \code{xgboost} function is a simpler wrapper for \code{xgb.train}. \details{ These are the training functions for \code{xgboost}. -The \code{xgb.train} interface supports advanced features such as \code{watchlist}, +The \code{xgb.train} interface supports advanced features such as \code{evals}, customized objective and evaluation metric functions, therefore it is more flexible than the \code{xgboost} interface. @@ -241,6 +242,11 @@ Parallelization is automatically enabled if \code{OpenMP} is present. Number of threads can also be manually specified via the \code{nthread} parameter. +While in other interfaces, the default random seed defaults to zero, in R, if a parameter \code{seed} +is not manually supplied, it will generate a random seed through R's own random number generator, +whose seed in turn is controllable through \code{set.seed}. If \code{seed} is passed, it will override the +RNG from R. + The evaluation metric is chosen automatically by XGBoost (according to the objective) when the \code{eval_metric} parameter is not provided. User may set one or several \code{eval_metric} parameters. @@ -264,18 +270,19 @@ Different threshold (e.g., 0.) could be specified as "error@0." The following callbacks are automatically created when certain parameters are set: \itemize{ -\item \code{cb.print.evaluation} is turned on when \code{verbose > 0}; +\item \code{xgb.cb.print.evaluation} is turned on when \code{verbose > 0}; and the \code{print_every_n} parameter is passed to it. -\item \code{cb.evaluation.log} is on when \code{watchlist} is present. -\item \code{cb.early.stop}: when \code{early_stopping_rounds} is set. -\item \code{cb.save.model}: when \code{save_period > 0} is set. +\item \code{xgb.cb.evaluation.log} is on when \code{evals} is present. +\item \code{xgb.cb.early.stop}: when \code{early_stopping_rounds} is set. +\item \code{xgb.cb.save.model}: when \code{save_period > 0} is set. } Note that objects of type \code{xgb.Booster} as returned by this function behave a bit differently from typical R objects (it's an 'altrep' list class), and it makes a separation between internal booster attributes (restricted to jsonifyable data), accessed through \link{xgb.attr} and shared between interfaces through serialization functions like \link{xgb.save}; and -R-specific attributes, accessed through \link{attributes} and \link{attr}, which are otherwise +R-specific attributes (typically the result from a callback), accessed through \link{attributes} +and \link{attr}, which are otherwise only used in the R interface, only kept when using R's serializers like \link{saveRDS}, and not anyhow used by functions like \link{predict.xgb.Booster}. 
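To make the split between R-level and C-level attributes described above concrete, here is a minimal sketch; it assumes a booster `bst` fitted with `evals` and early stopping as in the examples that follow, and uses placeholder file paths.

# R attributes come from callbacks and survive only R's own serializers:
attributes(bst)$evaluation_log                  # logged by xgb.cb.evaluation.log when evals are passed
# C-level booster attributes are shared across interfaces and kept by xgb.save():
xgb.attributes(bst)                             # e.g. best_iteration / best_score set by xgb.cb.early.stop
xgb.attr(bst, "best_iteration")
saveRDS(bst, file.path(tempdir(), "bst.rds"))   # keeps both kinds of attributes
xgb.save(bst, file.path(tempdir(), "bst.ubj"))  # keeps only the C-level attributes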
@@ -300,12 +307,12 @@ dtrain <- with( dtest <- with( agaricus.test, xgb.DMatrix(data, label = label, nthread = nthread) ) -watchlist <- list(train = dtrain, eval = dtest) +evals <- list(train = dtrain, eval = dtest) ## A simple xgb.train example: param <- list(max_depth = 2, eta = 1, nthread = nthread, objective = "binary:logistic", eval_metric = "auc") -bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0) +bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0) ## An xgb.train example where custom objective and evaluation metric are ## used: @@ -326,15 +333,15 @@ evalerror <- function(preds, dtrain) { # as 'objective' and 'eval_metric' parameters in the params list: param <- list(max_depth = 2, eta = 1, nthread = nthread, objective = logregobj, eval_metric = evalerror) -bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0) +bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0) # or through the ... arguments: param <- list(max_depth = 2, eta = 1, nthread = nthread) -bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, +bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0, objective = logregobj, eval_metric = evalerror) # or as dedicated 'obj' and 'feval' parameters of xgb.train: -bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, +bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, obj = logregobj, feval = evalerror) @@ -342,11 +349,11 @@ bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, param <- list(max_depth = 2, eta = 1, nthread = nthread, objective = "binary:logistic", eval_metric = "auc") my_etas <- list(eta = c(0.5, 0.1)) -bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, - callbacks = list(cb.reset.parameters(my_etas))) +bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0, + callbacks = list(xgb.cb.reset.parameters(my_etas))) ## Early stopping: -bst <- xgb.train(param, dtrain, nrounds = 25, watchlist, +bst <- xgb.train(param, dtrain, nrounds = 25, evals = evals, early_stopping_rounds = 3) ## An 'xgboost' interface example: @@ -361,7 +368,7 @@ Tianqi Chen and Carlos Guestrin, "XGBoost: A Scalable Tree Boosting System", 22nd SIGKDD Conference on Knowledge Discovery and Data Mining, 2016, \url{https://arxiv.org/abs/1603.02754} } \seealso{ -\code{\link{callbacks}}, +\code{\link{xgb.Callback}}, \code{\link{predict.xgb.Booster}}, \code{\link{xgb.cv}} } diff --git a/R-package/man/xgbConfig.Rd b/R-package/man/xgbConfig.Rd index 94b220c77..164c62ef4 100644 --- a/R-package/man/xgbConfig.Rd +++ b/R-package/man/xgbConfig.Rd @@ -25,6 +25,15 @@ values of one or more global-scope parameters. Use \code{xgb.get.config} to fetc values of all global-scope parameters (listed in \url{https://xgboost.readthedocs.io/en/stable/parameter.html}). } +\details{ +Note that serialization-related functions might use a globally-configured number of threads, +which is managed by the system's OpenMP (OMP) configuration instead. Typically, XGBoost methods +accept an \code{nthreads} parameter, but some methods like \code{readRDS} might get executed before such +parameter can be supplied. + +The number of OMP threads can in turn be configured for example through an environment variable +\code{OMP_NUM_THREADS} (needs to be set before R is started), or through \code{RhpcBLASctl::omp_set_num_threads}. 
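As a companion to the note above, a minimal sketch of capping the OpenMP threads before deserializing a model (the .rds path is a placeholder; RhpcBLASctl is assumed to be installed, as in the package examples):

# readRDS() may deserialize the booster before any nthread parameter can be supplied,
# so cap the OpenMP threads beforehand:
RhpcBLASctl::omp_set_num_threads(2)
bst <- readRDS(file.path(tempdir(), "bst.rds"))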
+} \examples{ # Set verbosity level to silent (0) xgb.set.config(verbosity = 0) diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in index dd13983f5..69cdd09a3 100644 --- a/R-package/src/Makevars.in +++ b/R-package/src/Makevars.in @@ -99,11 +99,14 @@ OBJECTS= \ $(PKGROOT)/src/context.o \ $(PKGROOT)/src/logging.o \ $(PKGROOT)/src/global_config.o \ + $(PKGROOT)/src/collective/result.o \ $(PKGROOT)/src/collective/allgather.o \ $(PKGROOT)/src/collective/allreduce.o \ $(PKGROOT)/src/collective/broadcast.o \ $(PKGROOT)/src/collective/comm.o \ + $(PKGROOT)/src/collective/comm_group.o \ $(PKGROOT)/src/collective/coll.o \ + $(PKGROOT)/src/collective/communicator-inl.o \ $(PKGROOT)/src/collective/tracker.o \ $(PKGROOT)/src/collective/communicator.o \ $(PKGROOT)/src/collective/in_memory_communicator.o \ diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win index 46a862711..b34d8c649 100644 --- a/R-package/src/Makevars.win +++ b/R-package/src/Makevars.win @@ -99,11 +99,14 @@ OBJECTS= \ $(PKGROOT)/src/context.o \ $(PKGROOT)/src/logging.o \ $(PKGROOT)/src/global_config.o \ + $(PKGROOT)/src/collective/result.o \ $(PKGROOT)/src/collective/allgather.o \ $(PKGROOT)/src/collective/allreduce.o \ $(PKGROOT)/src/collective/broadcast.o \ $(PKGROOT)/src/collective/comm.o \ + $(PKGROOT)/src/collective/comm_group.o \ $(PKGROOT)/src/collective/coll.o \ + $(PKGROOT)/src/collective/communicator-inl.o \ $(PKGROOT)/src/collective/tracker.o \ $(PKGROOT)/src/collective/communicator.o \ $(PKGROOT)/src/collective/in_memory_communicator.o \ diff --git a/R-package/src/init.c b/R-package/src/init.c index a9f3f3e38..5db3218b4 100644 --- a/R-package/src/init.c +++ b/R-package/src/init.c @@ -37,6 +37,9 @@ extern SEXP XGBoosterLoadJsonConfig_R(SEXP handle, SEXP value); extern SEXP XGBoosterSerializeToBuffer_R(SEXP handle); extern SEXP XGBoosterUnserializeFromBuffer_R(SEXP handle, SEXP raw); extern SEXP XGBoosterPredictFromDMatrix_R(SEXP, SEXP, SEXP); +extern SEXP XGBoosterPredictFromDense_R(SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP XGBoosterPredictFromCSR_R(SEXP, SEXP, SEXP, SEXP, SEXP); +extern SEXP XGBoosterPredictFromColumnar_R(SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP XGBoosterSaveModel_R(SEXP, SEXP); extern SEXP XGBoosterSetAttr_R(SEXP, SEXP, SEXP); extern SEXP XGBoosterSetParam_R(SEXP, SEXP, SEXP); @@ -46,7 +49,7 @@ extern SEXP XGSetArrayDimInplace_R(SEXP, SEXP); extern SEXP XGSetArrayDimNamesInplace_R(SEXP, SEXP); extern SEXP XGDMatrixCreateFromCSC_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); extern SEXP XGDMatrixCreateFromCSR_R(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); -extern SEXP XGDMatrixCreateFromFile_R(SEXP, SEXP); +extern SEXP XGDMatrixCreateFromURI_R(SEXP, SEXP, SEXP); extern SEXP XGDMatrixCreateFromMat_R(SEXP, SEXP, SEXP); extern SEXP XGDMatrixGetFloatInfo_R(SEXP, SEXP); extern SEXP XGDMatrixGetUIntInfo_R(SEXP, SEXP); @@ -68,11 +71,12 @@ extern SEXP XGDMatrixGetDataAsCSR_R(SEXP); extern SEXP XGDMatrixSaveBinary_R(SEXP, SEXP, SEXP); extern SEXP XGDMatrixSetInfo_R(SEXP, SEXP, SEXP); extern SEXP XGDMatrixSetStrFeatureInfo_R(SEXP, SEXP, SEXP); -extern SEXP XGDMatrixSliceDMatrix_R(SEXP, SEXP); +extern SEXP XGDMatrixSliceDMatrix_R(SEXP, SEXP, SEXP); extern SEXP XGBSetGlobalConfig_R(SEXP); extern SEXP XGBGetGlobalConfig_R(void); extern SEXP XGBoosterFeatureScore_R(SEXP, SEXP); extern SEXP XGBoosterSlice_R(SEXP, SEXP, SEXP, SEXP); +extern SEXP XGBoosterSliceAndReplace_R(SEXP, SEXP, SEXP, SEXP); static const R_CallMethodDef CallEntries[] = { {"XGDuplicate_R", (DL_FUNC) &XGDuplicate_R, 1}, @@ -96,6 
+100,9 @@ static const R_CallMethodDef CallEntries[] = { {"XGBoosterSerializeToBuffer_R", (DL_FUNC) &XGBoosterSerializeToBuffer_R, 1}, {"XGBoosterUnserializeFromBuffer_R", (DL_FUNC) &XGBoosterUnserializeFromBuffer_R, 2}, {"XGBoosterPredictFromDMatrix_R", (DL_FUNC) &XGBoosterPredictFromDMatrix_R, 3}, + {"XGBoosterPredictFromDense_R", (DL_FUNC) &XGBoosterPredictFromDense_R, 5}, + {"XGBoosterPredictFromCSR_R", (DL_FUNC) &XGBoosterPredictFromCSR_R, 5}, + {"XGBoosterPredictFromColumnar_R", (DL_FUNC) &XGBoosterPredictFromColumnar_R, 5}, {"XGBoosterSaveModel_R", (DL_FUNC) &XGBoosterSaveModel_R, 2}, {"XGBoosterSetAttr_R", (DL_FUNC) &XGBoosterSetAttr_R, 3}, {"XGBoosterSetParam_R", (DL_FUNC) &XGBoosterSetParam_R, 3}, @@ -105,7 +112,7 @@ static const R_CallMethodDef CallEntries[] = { {"XGSetArrayDimNamesInplace_R", (DL_FUNC) &XGSetArrayDimNamesInplace_R, 2}, {"XGDMatrixCreateFromCSC_R", (DL_FUNC) &XGDMatrixCreateFromCSC_R, 6}, {"XGDMatrixCreateFromCSR_R", (DL_FUNC) &XGDMatrixCreateFromCSR_R, 6}, - {"XGDMatrixCreateFromFile_R", (DL_FUNC) &XGDMatrixCreateFromFile_R, 2}, + {"XGDMatrixCreateFromURI_R", (DL_FUNC) &XGDMatrixCreateFromURI_R, 3}, {"XGDMatrixCreateFromMat_R", (DL_FUNC) &XGDMatrixCreateFromMat_R, 3}, {"XGDMatrixGetFloatInfo_R", (DL_FUNC) &XGDMatrixGetFloatInfo_R, 2}, {"XGDMatrixGetUIntInfo_R", (DL_FUNC) &XGDMatrixGetUIntInfo_R, 2}, @@ -127,11 +134,12 @@ static const R_CallMethodDef CallEntries[] = { {"XGDMatrixSaveBinary_R", (DL_FUNC) &XGDMatrixSaveBinary_R, 3}, {"XGDMatrixSetInfo_R", (DL_FUNC) &XGDMatrixSetInfo_R, 3}, {"XGDMatrixSetStrFeatureInfo_R", (DL_FUNC) &XGDMatrixSetStrFeatureInfo_R, 3}, - {"XGDMatrixSliceDMatrix_R", (DL_FUNC) &XGDMatrixSliceDMatrix_R, 2}, + {"XGDMatrixSliceDMatrix_R", (DL_FUNC) &XGDMatrixSliceDMatrix_R, 3}, {"XGBSetGlobalConfig_R", (DL_FUNC) &XGBSetGlobalConfig_R, 1}, {"XGBGetGlobalConfig_R", (DL_FUNC) &XGBGetGlobalConfig_R, 0}, {"XGBoosterFeatureScore_R", (DL_FUNC) &XGBoosterFeatureScore_R, 2}, {"XGBoosterSlice_R", (DL_FUNC) &XGBoosterSlice_R, 4}, + {"XGBoosterSliceAndReplace_R", (DL_FUNC) &XGBoosterSliceAndReplace_R, 4}, {NULL, NULL, 0} }; diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc index c91fb94c4..cdb9ba65c 100644 --- a/R-package/src/xgboost_R.cc +++ b/R-package/src/xgboost_R.cc @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -207,25 +208,24 @@ SEXP SafeAllocInteger(size_t size, SEXP continuation_token) { return xgboost::Json::Dump(jinterface); } -[[nodiscard]] std::string MakeJsonConfigForArray(SEXP missing, SEXP n_threads, SEXPTYPE arr_type) { - using namespace ::xgboost; // NOLINT - Json jconfig{Object{}}; - - const SEXPTYPE missing_type = TYPEOF(missing); - if (Rf_isNull(missing) || (missing_type == REALSXP && ISNAN(Rf_asReal(missing))) || - (missing_type == LGLSXP && Rf_asLogical(missing) == R_NaInt) || - (missing_type == INTSXP && Rf_asInteger(missing) == R_NaInt)) { +void AddMissingToJson(xgboost::Json *jconfig, SEXP missing, SEXPTYPE arr_type) { + if (Rf_isNull(missing) || ISNAN(Rf_asReal(missing))) { // missing is not specified if (arr_type == REALSXP) { - jconfig["missing"] = std::numeric_limits::quiet_NaN(); + (*jconfig)["missing"] = std::numeric_limits::quiet_NaN(); } else { - jconfig["missing"] = R_NaInt; + (*jconfig)["missing"] = R_NaInt; } } else { // missing specified - jconfig["missing"] = Rf_asReal(missing); + (*jconfig)["missing"] = Rf_asReal(missing); } +} +[[nodiscard]] std::string MakeJsonConfigForArray(SEXP missing, SEXP n_threads, SEXPTYPE arr_type) { + using namespace ::xgboost; 
// NOLINT + Json jconfig{Object{}}; + AddMissingToJson(&jconfig, missing, arr_type); jconfig["nthread"] = Rf_asInteger(n_threads); return Json::Dump(jconfig); } @@ -365,15 +365,22 @@ XGB_DLL SEXP XGBGetGlobalConfig_R() { return mkString(json_str); } -XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) { - SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); +XGB_DLL SEXP XGDMatrixCreateFromURI_R(SEXP uri, SEXP silent, SEXP data_split_mode) { + SEXP ret = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); + SEXP uri_char = Rf_protect(Rf_asChar(uri)); + const char *uri_ptr = CHAR(uri_char); R_API_BEGIN(); + xgboost::Json jconfig{xgboost::Object{}}; + jconfig["uri"] = std::string(uri_ptr); + jconfig["silent"] = Rf_asLogical(silent); + jconfig["data_split_mode"] = Rf_asInteger(data_split_mode); + const std::string sconfig = xgboost::Json::Dump(jconfig); DMatrixHandle handle; - CHECK_CALL(XGDMatrixCreateFromFile(CHAR(asChar(fname)), asInteger(silent), &handle)); + CHECK_CALL(XGDMatrixCreateFromURI(sconfig.c_str(), &handle)); R_SetExternalPtrAddr(ret, handle); R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE); R_API_END(); - UNPROTECT(1); + Rf_unprotect(2); return ret; } @@ -404,7 +411,7 @@ XGB_DLL SEXP XGDMatrixCreateFromDF_R(SEXP df, SEXP missing, SEXP n_threads) { DMatrixHandle handle; std::int32_t rc{0}; { - std::string sinterface = MakeArrayInterfaceFromRDataFrame(df); + const std::string sinterface = MakeArrayInterfaceFromRDataFrame(df); xgboost::Json jconfig{xgboost::Object{}}; jconfig["missing"] = asReal(missing); jconfig["nthread"] = asInteger(n_threads); @@ -456,7 +463,7 @@ XGB_DLL SEXP XGDMatrixCreateFromCSC_R(SEXP indptr, SEXP indices, SEXP data, SEXP Json jconfig{Object{}}; // Construct configuration jconfig["nthread"] = Integer{threads}; - jconfig["missing"] = xgboost::Number{asReal(missing)}; + AddMissingToJson(&jconfig, missing, TYPEOF(data)); std::string config; Json::Dump(jconfig, &config); res_code = XGDMatrixCreateFromCSC(sindptr.c_str(), sindices.c_str(), sdata.c_str(), nrow, @@ -491,7 +498,7 @@ XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP Json jconfig{Object{}}; // Construct configuration jconfig["nthread"] = Integer{threads}; - jconfig["missing"] = xgboost::Number{asReal(missing)}; + AddMissingToJson(&jconfig, missing, TYPEOF(data)); std::string config; Json::Dump(jconfig, &config); res_code = XGDMatrixCreateFromCSR(sindptr.c_str(), sindices.c_str(), sdata.c_str(), ncol, @@ -505,7 +512,7 @@ XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP return ret; } -XGB_DLL SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) { +XGB_DLL SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset, SEXP allow_groups) { SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); R_API_BEGIN(); R_xlen_t len = Rf_xlength(idxset); @@ -524,7 +531,7 @@ XGB_DLL SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset) { res_code = XGDMatrixSliceDMatrixEx(R_ExternalPtrAddr(handle), BeginPtr(idxvec), len, &res, - 0); + Rf_asLogical(allow_groups)); } CHECK_CALL(res_code); R_SetExternalPtrAddr(ret, res); @@ -1240,7 +1247,60 @@ XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evn return mkString(ret); } -XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config) { +namespace { + +struct ProxyDmatrixError : public std::exception {}; + +struct ProxyDmatrixWrapper { + DMatrixHandle proxy_dmat_handle; + + ProxyDmatrixWrapper() { + 
int res_code = XGProxyDMatrixCreate(&this->proxy_dmat_handle); + if (res_code != 0) { + throw ProxyDmatrixError(); + } + } + + ~ProxyDmatrixWrapper() { + if (this->proxy_dmat_handle) { + XGDMatrixFree(this->proxy_dmat_handle); + this->proxy_dmat_handle = nullptr; + } + } + + DMatrixHandle get_handle() { + return this->proxy_dmat_handle; + } +}; + +std::unique_ptr GetProxyDMatrixWithBaseMargin(SEXP base_margin) { + if (Rf_isNull(base_margin)) { + return std::unique_ptr(nullptr); + } + + SEXP base_margin_dim = Rf_getAttrib(base_margin, R_DimSymbol); + int res_code; + try { + const std::string array_str = Rf_isNull(base_margin_dim)? + MakeArrayInterfaceFromRVector(base_margin) : MakeArrayInterfaceFromRMat(base_margin); + std::unique_ptr proxy_dmat(new ProxyDmatrixWrapper()); + res_code = XGDMatrixSetInfoFromInterface(proxy_dmat->get_handle(), + "base_margin", + array_str.c_str()); + if (res_code != 0) { + throw ProxyDmatrixError(); + } + return proxy_dmat; + } catch(ProxyDmatrixError &err) { + Rf_error("%s", XGBGetLastError()); + } +} + +enum class PredictionInputType {DMatrix, DenseMatrix, CSRMatrix, DataFrame}; + +SEXP XGBoosterPredictGeneric(SEXP handle, SEXP input_data, SEXP json_config, + PredictionInputType input_type, SEXP missing, + SEXP base_margin) { SEXP r_out_shape; SEXP r_out_result; SEXP r_out = PROTECT(allocVector(VECSXP, 2)); @@ -1252,9 +1312,79 @@ XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_con bst_ulong out_dim; bst_ulong const *out_shape; float const *out_result; - CHECK_CALL(XGBoosterPredictFromDMatrix(R_ExternalPtrAddr(handle), - R_ExternalPtrAddr(dmat), c_json_config, - &out_shape, &out_dim, &out_result)); + + int res_code; + { + switch (input_type) { + case PredictionInputType::DMatrix: { + res_code = XGBoosterPredictFromDMatrix(R_ExternalPtrAddr(handle), + R_ExternalPtrAddr(input_data), c_json_config, + &out_shape, &out_dim, &out_result); + break; + } + + case PredictionInputType::CSRMatrix: { + std::unique_ptr proxy_dmat = GetProxyDMatrixWithBaseMargin( + base_margin); + DMatrixHandle proxy_dmat_handle = proxy_dmat.get()? proxy_dmat->get_handle() : nullptr; + + SEXP indptr = VECTOR_ELT(input_data, 0); + SEXP indices = VECTOR_ELT(input_data, 1); + SEXP data = VECTOR_ELT(input_data, 2); + const int ncol_csr = Rf_asInteger(VECTOR_ELT(input_data, 3)); + const SEXPTYPE type_data = TYPEOF(data); + CHECK_EQ(type_data, REALSXP); + std::string sindptr, sindices, sdata; + CreateFromSparse(indptr, indices, data, &sindptr, &sindices, &sdata); + + xgboost::StringView json_str(c_json_config); + xgboost::Json new_json = xgboost::Json::Load(json_str); + AddMissingToJson(&new_json, missing, type_data); + const std::string new_c_json = xgboost::Json::Dump(new_json); + + res_code = XGBoosterPredictFromCSR( + R_ExternalPtrAddr(handle), sindptr.c_str(), sindices.c_str(), sdata.c_str(), + ncol_csr, new_c_json.c_str(), proxy_dmat_handle, &out_shape, &out_dim, &out_result); + break; + } + + case PredictionInputType::DenseMatrix: { + std::unique_ptr proxy_dmat = GetProxyDMatrixWithBaseMargin( + base_margin); + DMatrixHandle proxy_dmat_handle = proxy_dmat.get()? 
proxy_dmat->get_handle() : nullptr; + const std::string array_str = MakeArrayInterfaceFromRMat(input_data); + + xgboost::StringView json_str(c_json_config); + xgboost::Json new_json = xgboost::Json::Load(json_str); + AddMissingToJson(&new_json, missing, TYPEOF(input_data)); + const std::string new_c_json = xgboost::Json::Dump(new_json); + + res_code = XGBoosterPredictFromDense( + R_ExternalPtrAddr(handle), array_str.c_str(), new_c_json.c_str(), + proxy_dmat_handle, &out_shape, &out_dim, &out_result); + break; + } + + case PredictionInputType::DataFrame: { + std::unique_ptr proxy_dmat = GetProxyDMatrixWithBaseMargin( + base_margin); + DMatrixHandle proxy_dmat_handle = proxy_dmat.get()? proxy_dmat->get_handle() : nullptr; + + const std::string df_str = MakeArrayInterfaceFromRDataFrame(input_data); + + xgboost::StringView json_str(c_json_config); + xgboost::Json new_json = xgboost::Json::Load(json_str); + AddMissingToJson(&new_json, missing, REALSXP); + const std::string new_c_json = xgboost::Json::Dump(new_json); + + res_code = XGBoosterPredictFromColumnar( + R_ExternalPtrAddr(handle), df_str.c_str(), new_c_json.c_str(), + proxy_dmat_handle, &out_shape, &out_dim, &out_result); + break; + } + } + } + CHECK_CALL(res_code); r_out_shape = PROTECT(allocVector(INTSXP, out_dim)); size_t len = 1; @@ -1275,6 +1405,31 @@ XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_con return r_out; } +} // namespace + +XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config) { + return XGBoosterPredictGeneric(handle, dmat, json_config, + PredictionInputType::DMatrix, R_NilValue, R_NilValue); +} + +XGB_DLL SEXP XGBoosterPredictFromDense_R(SEXP handle, SEXP R_mat, SEXP missing, + SEXP json_config, SEXP base_margin) { + return XGBoosterPredictGeneric(handle, R_mat, json_config, + PredictionInputType::DenseMatrix, missing, base_margin); +} + +XGB_DLL SEXP XGBoosterPredictFromCSR_R(SEXP handle, SEXP lst, SEXP missing, + SEXP json_config, SEXP base_margin) { + return XGBoosterPredictGeneric(handle, lst, json_config, + PredictionInputType::CSRMatrix, missing, base_margin); +} + +XGB_DLL SEXP XGBoosterPredictFromColumnar_R(SEXP handle, SEXP R_df, SEXP missing, + SEXP json_config, SEXP base_margin) { + return XGBoosterPredictGeneric(handle, R_df, json_config, + PredictionInputType::DataFrame, missing, base_margin); +} + XGB_DLL SEXP XGBoosterLoadModel_R(SEXP handle, SEXP fname) { R_API_BEGIN(); CHECK_CALL(XGBoosterLoadModel(R_ExternalPtrAddr(handle), CHAR(asChar(fname)))); @@ -1519,3 +1674,18 @@ XGB_DLL SEXP XGBoosterSlice_R(SEXP handle, SEXP begin_layer, SEXP end_layer, SEX Rf_unprotect(1); return out; } + +XGB_DLL SEXP XGBoosterSliceAndReplace_R(SEXP handle, SEXP begin_layer, SEXP end_layer, SEXP step) { + R_API_BEGIN(); + BoosterHandle old_handle = R_ExternalPtrAddr(handle); + BoosterHandle new_handle = nullptr; + CHECK_CALL(XGBoosterSlice(old_handle, + Rf_asInteger(begin_layer), + Rf_asInteger(end_layer), + Rf_asInteger(step), + &new_handle)); + R_SetExternalPtrAddr(handle, new_handle); + CHECK_CALL(XGBoosterFree(old_handle)); + R_API_END(); + return R_NilValue; +} diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h index d2e0ae828..62be5022a 100644 --- a/R-package/src/xgboost_R.h +++ b/R-package/src/xgboost_R.h @@ -53,12 +53,13 @@ XGB_DLL SEXP XGBSetGlobalConfig_R(SEXP json_str); XGB_DLL SEXP XGBGetGlobalConfig_R(); /*! 
- * \brief load a data matrix - * \param fname name of the content + * \brief load a data matrix from URI + * \param uri URI to the source file to read data from * \param silent whether print messages + * \param data_split_mode Data split mode (0=rows, 1=columns) * \return a loaded data matrix */ -XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent); +XGB_DLL SEXP XGDMatrixCreateFromURI_R(SEXP uri, SEXP silent, SEXP data_split_mode); /*! * \brief create matrix content from dense matrix @@ -111,9 +112,10 @@ XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP * \brief create a new dmatrix from sliced content of existing matrix * \param handle instance of data matrix to be sliced * \param idxset index set + * \param allow_groups Whether to allow slicing the DMatrix if it has a 'group' field * \return a sliced new matrix */ -XGB_DLL SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset); +XGB_DLL SEXP XGDMatrixSliceDMatrix_R(SEXP handle, SEXP idxset, SEXP allow_groups); /*! * \brief load a data matrix into binary file @@ -370,6 +372,50 @@ XGB_DLL SEXP XGBoosterEvalOneIter_R(SEXP handle, SEXP iter, SEXP dmats, SEXP evn * \return A list containing 2 vectors, first one for shape while second one for prediction result. */ XGB_DLL SEXP XGBoosterPredictFromDMatrix_R(SEXP handle, SEXP dmat, SEXP json_config); + +/*! + * \brief Run prediction on R dense matrix + * \param handle handle + * \param R_mat R matrix + * \param missing missing value + * \param json_config See `XGBoosterPredictFromDense` in xgboost c_api.h. Doesn't include 'missing' + * \param base_margin base margin for the prediction + * + * \return A list containing 2 vectors, first one for shape while second one for prediction result. + */ +XGB_DLL SEXP XGBoosterPredictFromDense_R(SEXP handle, SEXP R_mat, SEXP missing, + SEXP json_config, SEXP base_margin); + +/*! + * \brief Run prediction on R CSR matrix + * \param handle handle + * \param lst An R list, containing, in this order: + * (a) 'p' array (a.k.a. indptr) + * (b) 'j' array (a.k.a. indices) + * (c) 'x' array (a.k.a. data / values) + * (d) number of columns + * \param missing missing value + * \param json_config See `XGBoosterPredictFromCSR` in xgboost c_api.h. Doesn't include 'missing' + * \param base_margin base margin for the prediction + * + * \return A list containing 2 vectors, first one for shape while second one for prediction result. + */ +XGB_DLL SEXP XGBoosterPredictFromCSR_R(SEXP handle, SEXP lst, SEXP missing, + SEXP json_config, SEXP base_margin); + +/*! + * \brief Run prediction on R data.frame + * \param handle handle + * \param R_df R data.frame + * \param missing missing value + * \param json_config See `XGBoosterPredictFromDense` in xgboost c_api.h. Doesn't include 'missing' + * \param base_margin base margin for the prediction + * + * \return A list containing 2 vectors, first one for shape while second one for prediction result. + */ +XGB_DLL SEXP XGBoosterPredictFromColumnar_R(SEXP handle, SEXP R_df, SEXP missing, + SEXP json_config, SEXP base_margin); + /*! * \brief load model from existing file * \param handle handle @@ -490,4 +536,14 @@ XGB_DLL SEXP XGBoosterFeatureScore_R(SEXP handle, SEXP json_config); */ XGB_DLL SEXP XGBoosterSlice_R(SEXP handle, SEXP begin_layer, SEXP end_layer, SEXP step); +/*!
+ * \brief Slice a fitted booster model (by rounds), and replace its handle with the result + * \param handle handle to the fitted booster + * \param begin_layer start of the slice + * \param end_layer end of the slice; end_layer=0 is equivalent to end_layer=num_boost_round + * \param step step size of the slice + * \return NULL + */ +XGB_DLL SEXP XGBoosterSliceAndReplace_R(SEXP handle, SEXP begin_layer, SEXP end_layer, SEXP step); + #endif // XGBOOST_WRAPPER_R_H_ // NOLINT(*) diff --git a/R-package/src/xgboost_custom.cc b/R-package/src/xgboost_custom.cc index 4b05361ca..fb548c61d 100644 --- a/R-package/src/xgboost_custom.cc +++ b/R-package/src/xgboost_custom.cc @@ -41,16 +41,6 @@ double LogGamma(double v) { return lgammafn(v); } #endif // !defined(XGBOOST_USE_CUDA) -// customize random engine. -void CustomGlobalRandomEngine::seed(CustomGlobalRandomEngine::result_type val) { - // ignore the seed -} -// use R's PRNG to replacd -CustomGlobalRandomEngine::result_type -CustomGlobalRandomEngine::operator()() { - return static_cast( - std::floor(unif_rand() * CustomGlobalRandomEngine::max())); -} } // namespace common } // namespace xgboost diff --git a/R-package/tests/helper_scripts/install_deps.R b/R-package/tests/helper_scripts/install_deps.R index 3ae44f6b1..7a621798a 100644 --- a/R-package/tests/helper_scripts/install_deps.R +++ b/R-package/tests/helper_scripts/install_deps.R @@ -20,6 +20,7 @@ pkgs <- c( "igraph", "float", "titanic", + "RhpcBLASctl", ## imports "Matrix", "methods", diff --git a/R-package/tests/testthat.R b/R-package/tests/testthat.R index 3bb229e70..bad6c1df3 100644 --- a/R-package/tests/testthat.R +++ b/R-package/tests/testthat.R @@ -1,4 +1,6 @@ library(testthat) library(xgboost) +library(Matrix) test_check("xgboost", reporter = ProgressReporter) +RhpcBLASctl::omp_set_num_threads(1) diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 03a8ddbe1..bbb8fb323 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -20,7 +20,7 @@ test_that("train and predict binary classification", { data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, eta = 1, nthread = n_threads, nrounds = nrounds, objective = "binary:logistic", eval_metric = "error", - watchlist = list(train = xgb.DMatrix(train$data, label = train$label)) + evals = list(train = xgb.DMatrix(train$data, label = train$label)) ), "train-error" ) @@ -139,8 +139,8 @@ test_that("dart prediction works", { pred_by_train_1 <- predict(booster_by_train, newdata = dtrain, iterationrange = c(1, nrounds)) pred_by_train_2 <- predict(booster_by_train, newdata = dtrain, training = TRUE) - expect_true(all(matrix(pred_by_train_0, byrow = TRUE) == matrix(pred_by_xgboost_0, byrow = TRUE))) - expect_true(all(matrix(pred_by_train_1, byrow = TRUE) == matrix(pred_by_xgboost_1, byrow = TRUE))) + expect_equal(pred_by_train_0, pred_by_xgboost_0, tolerance = 1e-6) + expect_equal(pred_by_train_1, pred_by_xgboost_1, tolerance = 1e-6) expect_true(all(matrix(pred_by_train_2, byrow = TRUE) == matrix(pred_by_xgboost_2, byrow = TRUE))) }) @@ -152,7 +152,7 @@ test_that("train and predict softprob", { data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb), max_depth = 3, eta = 0.5, nthread = n_threads, nrounds = 5, objective = "multi:softprob", num_class = 3, eval_metric = "merror", - watchlist = list(train = xgb.DMatrix(as.matrix(iris[, -5]), label = lb)) + evals = list(train = xgb.DMatrix(as.matrix(iris[, -5]), label = lb)) ), "train-merror" ) @@ -203,7 +203,7 @@
test_that("train and predict softmax", { data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb), max_depth = 3, eta = 0.5, nthread = n_threads, nrounds = 5, objective = "multi:softmax", num_class = 3, eval_metric = "merror", - watchlist = list(train = xgb.DMatrix(as.matrix(iris[, -5]), label = lb)) + evals = list(train = xgb.DMatrix(as.matrix(iris[, -5]), label = lb)) ), "train-merror" ) @@ -226,7 +226,7 @@ test_that("train and predict RF", { nthread = n_threads, nrounds = 1, objective = "binary:logistic", eval_metric = "error", num_parallel_tree = 20, subsample = 0.6, colsample_bytree = 0.1, - watchlist = list(train = xgb.DMatrix(train$data, label = lb)) + evals = list(train = xgb.DMatrix(train$data, label = lb)) ) expect_equal(xgb.get.num.boosted.rounds(bst), 1) @@ -250,7 +250,7 @@ test_that("train and predict RF with softprob", { objective = "multi:softprob", eval_metric = "merror", num_class = 3, verbose = 0, num_parallel_tree = 4, subsample = 0.5, colsample_bytree = 0.5, - watchlist = list(train = xgb.DMatrix(as.matrix(iris[, -5]), label = lb)) + evals = list(train = xgb.DMatrix(as.matrix(iris[, -5]), label = lb)) ) expect_equal(xgb.get.num.boosted.rounds(bst), 15) # predict for all iterations: @@ -271,7 +271,7 @@ test_that("use of multiple eval metrics works", { data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic", eval_metric = "error", eval_metric = "auc", eval_metric = "logloss", - watchlist = list(train = xgb.DMatrix(train$data, label = train$label)) + evals = list(train = xgb.DMatrix(train$data, label = train$label)) ), "train-error.*train-auc.*train-logloss" ) @@ -283,7 +283,7 @@ test_that("use of multiple eval metrics works", { data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, eta = 1, nthread = n_threads, nrounds = 2, objective = "binary:logistic", eval_metric = list("error", "auc", "logloss"), - watchlist = list(train = xgb.DMatrix(train$data, label = train$label)) + evals = list(train = xgb.DMatrix(train$data, label = train$label)) ), "train-error.*train-auc.*train-logloss" ) @@ -295,19 +295,19 @@ test_that("use of multiple eval metrics works", { test_that("training continuation works", { dtrain <- xgb.DMatrix(train$data, label = train$label, nthread = n_threads) - watchlist <- list(train = dtrain) + evals <- list(train = dtrain) param <- list( objective = "binary:logistic", max_depth = 2, eta = 1, nthread = n_threads ) # for the reference, use 4 iterations at once: set.seed(11) - bst <- xgb.train(param, dtrain, nrounds = 4, watchlist, verbose = 0) + bst <- xgb.train(param, dtrain, nrounds = 4, evals = evals, verbose = 0) # first two iterations: set.seed(11) - bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0) + bst1 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0) # continue for two more: - bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = bst1) + bst2 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0, xgb_model = bst1) if (!windows_flag && !solaris_flag) { expect_equal(xgb.save.raw(bst), xgb.save.raw(bst2)) } @@ -315,7 +315,7 @@ test_that("training continuation works", { expect_equal(dim(attributes(bst2)$evaluation_log), c(4, 2)) expect_equal(attributes(bst2)$evaluation_log, attributes(bst)$evaluation_log) # test continuing from raw model data - bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = xgb.save.raw(bst1)) + bst2 <- xgb.train(param, dtrain, 
nrounds = 2, evals = evals, verbose = 0, xgb_model = xgb.save.raw(bst1)) if (!windows_flag && !solaris_flag) { expect_equal(xgb.save.raw(bst), xgb.save.raw(bst2)) } @@ -323,7 +323,7 @@ test_that("training continuation works", { # test continuing from a model in file fname <- file.path(tempdir(), "xgboost.json") xgb.save(bst1, fname) - bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = fname) + bst2 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0, xgb_model = fname) if (!windows_flag && !solaris_flag) { expect_equal(xgb.save.raw(bst), xgb.save.raw(bst2)) } @@ -334,7 +334,7 @@ test_that("xgb.cv works", { set.seed(11) expect_output( cv <- xgb.cv( - data = train$data, label = train$label, max_depth = 2, nfold = 5, + data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, nfold = 5, eta = 1., nthread = n_threads, nrounds = 2, objective = "binary:logistic", eval_metric = "error", verbose = TRUE ), @@ -348,7 +348,6 @@ test_that("xgb.cv works", { expect_false(is.null(cv$folds) && is.list(cv$folds)) expect_length(cv$folds, 5) expect_false(is.null(cv$params) && is.list(cv$params)) - expect_false(is.null(cv$callbacks)) expect_false(is.null(cv$call)) }) @@ -358,13 +357,13 @@ test_that("xgb.cv works with stratified folds", { cv <- xgb.cv( data = dtrain, max_depth = 2, nfold = 5, eta = 1., nthread = n_threads, nrounds = 2, objective = "binary:logistic", - verbose = TRUE, stratified = FALSE + verbose = FALSE, stratified = FALSE ) set.seed(314159) cv2 <- xgb.cv( data = dtrain, max_depth = 2, nfold = 5, eta = 1., nthread = n_threads, nrounds = 2, objective = "binary:logistic", - verbose = TRUE, stratified = TRUE + verbose = FALSE, stratified = TRUE ) # Stratified folds should result in a different evaluation logs expect_true(all(cv$evaluation_log[, test_logloss_mean] != cv2$evaluation_log[, test_logloss_mean])) @@ -418,7 +417,7 @@ test_that("max_delta_step works", { dtrain <- xgb.DMatrix( agaricus.train$data, label = agaricus.train$label, nthread = n_threads ) - watchlist <- list(train = dtrain) + evals <- list(train = dtrain) param <- list( objective = "binary:logistic", eval_metric = "logloss", max_depth = 2, nthread = n_threads, @@ -426,9 +425,9 @@ test_that("max_delta_step works", { ) nrounds <- 5 # model with no restriction on max_delta_step - bst1 <- xgb.train(param, dtrain, nrounds, watchlist, verbose = 1) + bst1 <- xgb.train(param, dtrain, nrounds, evals = evals, verbose = 1) # model with restricted max_delta_step - bst2 <- xgb.train(param, dtrain, nrounds, watchlist, verbose = 1, max_delta_step = 1) + bst2 <- xgb.train(param, dtrain, nrounds, evals = evals, verbose = 1, max_delta_step = 1) # the no-restriction model is expected to have consistently lower loss during the initial iterations expect_true(all(attributes(bst1)$evaluation_log$train_logloss < attributes(bst2)$evaluation_log$train_logloss)) expect_lt(mean(attributes(bst1)$evaluation_log$train_logloss) / mean(attributes(bst2)$evaluation_log$train_logloss), 0.8) @@ -445,7 +444,7 @@ test_that("colsample_bytree works", { colnames(test_x) <- paste0("Feature_", sprintf("%03d", 1:100)) dtrain <- xgb.DMatrix(train_x, label = train_y, nthread = n_threads) dtest <- xgb.DMatrix(test_x, label = test_y, nthread = n_threads) - watchlist <- list(train = dtrain, eval = dtest) + evals <- list(train = dtrain, eval = dtest) ## Use colsample_bytree = 0.01, so that roughly one out of 100 features is chosen for ## each tree param <- list( @@ -454,7 +453,7 @@ test_that("colsample_bytree works", { 
eval_metric = "auc" ) set.seed(2) - bst <- xgb.train(param, dtrain, nrounds = 100, watchlist, verbose = 0) + bst <- xgb.train(param, dtrain, nrounds = 100, evals = evals, verbose = 0) xgb.importance(model = bst) # If colsample_bytree works properly, a variety of features should be used # in the 100 trees @@ -651,6 +650,51 @@ test_that("Can use ranking objectives with either 'qid' or 'group'", { expect_equal(pred_qid, pred_gr) }) +test_that("Can predict on data.frame objects", { + data("mtcars") + y <- mtcars$mpg + x_df <- mtcars[, -1] + x_mat <- as.matrix(x_df) + dm <- xgb.DMatrix(x_mat, label = y, nthread = n_threads) + model <- xgb.train( + params = list( + tree_method = "hist", + objective = "reg:squarederror", + nthread = n_threads + ), + data = dm, + nrounds = 5 + ) + + pred_mat <- predict(model, xgb.DMatrix(x_mat), nthread = n_threads) + pred_df <- predict(model, x_df, nthread = n_threads) + expect_equal(pred_mat, pred_df) +}) + +test_that("'base_margin' gives the same result in DMatrix as in inplace_predict", { + data("mtcars") + y <- mtcars$mpg + x <- as.matrix(mtcars[, -1]) + dm <- xgb.DMatrix(x, label = y, nthread = n_threads) + model <- xgb.train( + params = list( + tree_method = "hist", + objective = "reg:squarederror", + nthread = n_threads + ), + data = dm, + nrounds = 5 + ) + + set.seed(123) + base_margin <- rnorm(nrow(x)) + dm_w_base <- xgb.DMatrix(data = x, base_margin = base_margin) + pred_from_dm <- predict(model, dm_w_base) + pred_from_mat <- predict(model, x, base_margin = base_margin) + + expect_equal(pred_from_dm, pred_from_mat) +}) + test_that("Coefficients from gblinear have the expected shape and names", { # Single-column coefficients data(mtcars) @@ -778,3 +822,120 @@ test_that("DMatrix field are set to booster when training", { expect_equal(getinfo(model_feature_types, "feature_type"), c("q", "c", "q")) expect_equal(getinfo(model_both, "feature_type"), c("q", "c", "q")) }) + +test_that("Seed in params override PRNG from R", { + set.seed(123) + model1 <- xgb.train( + data = xgb.DMatrix( + agaricus.train$data, + label = agaricus.train$label, nthread = 1L + ), + params = list( + objective = "binary:logistic", + max_depth = 3L, + subsample = 0.1, + colsample_bytree = 0.1, + seed = 111L + ), + nrounds = 3L + ) + + set.seed(456) + model2 <- xgb.train( + data = xgb.DMatrix( + agaricus.train$data, + label = agaricus.train$label, nthread = 1L + ), + params = list( + objective = "binary:logistic", + max_depth = 3L, + subsample = 0.1, + colsample_bytree = 0.1, + seed = 111L + ), + nrounds = 3L + ) + + expect_equal( + xgb.save.raw(model1, raw_format = "json"), + xgb.save.raw(model2, raw_format = "json") + ) + + set.seed(123) + model3 <- xgb.train( + data = xgb.DMatrix( + agaricus.train$data, + label = agaricus.train$label, nthread = 1L + ), + params = list( + objective = "binary:logistic", + max_depth = 3L, + subsample = 0.1, + colsample_bytree = 0.1, + seed = 222L + ), + nrounds = 3L + ) + expect_false( + isTRUE( + all.equal( + xgb.save.raw(model1, raw_format = "json"), + xgb.save.raw(model3, raw_format = "json") + ) + ) + ) +}) + +test_that("xgb.cv works for AFT", { + X <- matrix(c(1, -1, -1, 1, 0, 1, 1, 0), nrow = 4, byrow = TRUE) # 4x2 matrix + dtrain <- xgb.DMatrix(X, nthread = n_threads) + + params <- list(objective = 'survival:aft', learning_rate = 0.2, max_depth = 2L) + + # data must have bounds + expect_error( + xgb.cv( + params = params, + data = dtrain, + nround = 5L, + nfold = 4L, + nthread = n_threads + ) + ) + + setinfo(dtrain, 'label_lower_bound', c(2, 3, 0, 
4)) + setinfo(dtrain, 'label_upper_bound', c(2, Inf, 4, 5)) + + # automatic stratified splitting is turned off + expect_warning( + xgb.cv( + params = params, data = dtrain, nround = 5L, nfold = 4L, + nthread = n_threads, stratified = TRUE, verbose = FALSE + ) + ) + + # this works without any issue + expect_no_warning( + xgb.cv(params = params, data = dtrain, nround = 5L, nfold = 4L, verbose = FALSE) + ) +}) + +test_that("xgb.cv works for ranking", { + data(iris) + x <- iris[, -(4:5)] + y <- as.integer(iris$Petal.Width) + group <- rep(50, 3) + dm <- xgb.DMatrix(x, label = y, group = group) + res <- xgb.cv( + data = dm, + params = list( + objective = "rank:pairwise", + max_depth = 3 + ), + nrounds = 3, + nfold = 2, + verbose = FALSE, + stratified = FALSE + ) + expect_equal(length(res$folds), 2L) +}) diff --git a/R-package/tests/testthat/test_callbacks.R b/R-package/tests/testthat/test_callbacks.R index c60d0c246..bf95a170d 100644 --- a/R-package/tests/testthat/test_callbacks.R +++ b/R-package/tests/testthat/test_callbacks.R @@ -19,7 +19,7 @@ ltrain <- add.noise(train$label, 0.2) ltest <- add.noise(test$label, 0.2) dtrain <- xgb.DMatrix(train$data, label = ltrain, nthread = n_threads) dtest <- xgb.DMatrix(test$data, label = ltest, nthread = n_threads) -watchlist <- list(train = dtrain, test = dtest) +evals <- list(train = dtrain, test = dtest) err <- function(label, pr) sum((pr > 0.5) != label) / length(label) @@ -28,79 +28,125 @@ param <- list(objective = "binary:logistic", eval_metric = "error", max_depth = 2, nthread = n_threads) -test_that("cb.print.evaluation works as expected", { +test_that("xgb.cb.print.evaluation works as expected for xgb.train", { + logs1 <- capture.output({ + model <- xgb.train( + data = dtrain, + params = list( + objective = "binary:logistic", + eval_metric = "auc", + max_depth = 2, + nthread = n_threads + ), + nrounds = 10, + evals = list(train = dtrain, test = dtest), + callbacks = list(xgb.cb.print.evaluation(period = 1)) + ) + }) + expect_equal(length(logs1), 10) + expect_true(all(grepl("^\\[\\d{1,2}\\]\ttrain-auc:0\\.\\d+\ttest-auc:0\\.\\d+\\s*$", logs1))) + lapply(seq(1, 10), function(x) expect_true(grepl(paste0("^\\[", x), logs1[x]))) - bst_evaluation <- c('train-auc' = 0.9, 'test-auc' = 0.8) - bst_evaluation_err <- NULL - begin_iteration <- 1 - end_iteration <- 7 - - f0 <- cb.print.evaluation(period = 0) - f1 <- cb.print.evaluation(period = 1) - f5 <- cb.print.evaluation(period = 5) - - expect_false(is.null(attr(f1, 'call'))) - expect_equal(attr(f1, 'name'), 'cb.print.evaluation') - - iteration <- 1 - expect_silent(f0()) - expect_output(f1(), "\\[1\\]\ttrain-auc:0.900000\ttest-auc:0.800000") - expect_output(f5(), "\\[1\\]\ttrain-auc:0.900000\ttest-auc:0.800000") - expect_null(f1()) - - iteration <- 2 - expect_output(f1(), "\\[2\\]\ttrain-auc:0.900000\ttest-auc:0.800000") - expect_silent(f5()) - - iteration <- 7 - expect_output(f1(), "\\[7\\]\ttrain-auc:0.900000\ttest-auc:0.800000") - expect_output(f5(), "\\[7\\]\ttrain-auc:0.900000\ttest-auc:0.800000") - - bst_evaluation_err <- c('train-auc' = 0.1, 'test-auc' = 0.2) - expect_output(f1(), "\\[7\\]\ttrain-auc:0.900000±0.100000\ttest-auc:0.800000±0.200000") + logs2 <- capture.output({ + model <- xgb.train( + data = dtrain, + params = list( + objective = "binary:logistic", + eval_metric = "auc", + max_depth = 2, + nthread = n_threads + ), + nrounds = 10, + evals = list(train = dtrain, test = dtest), + callbacks = list(xgb.cb.print.evaluation(period = 2)) + ) + }) + expect_equal(length(logs2), 6) + 
expect_true(all(grepl("^\\[\\d{1,2}\\]\ttrain-auc:0\\.\\d+\ttest-auc:0\\.\\d+\\s*$", logs2))) + seq_matches <- c(seq(1, 10, 2), 10) + lapply(seq_along(seq_matches), function(x) expect_true(grepl(paste0("^\\[", seq_matches[x]), logs2[x]))) }) -test_that("cb.evaluation.log works as expected", { +test_that("xgb.cb.print.evaluation works as expected for xgb.cv", { + logs1 <- capture.output({ + model <- xgb.cv( + data = dtrain, + params = list( + objective = "binary:logistic", + eval_metric = "auc", + max_depth = 2, + nthread = n_threads + ), + nrounds = 10, + nfold = 3, + callbacks = list(xgb.cb.print.evaluation(period = 1, showsd = TRUE)) + ) + }) + expect_equal(length(logs1), 10) + expect_true(all(grepl("^\\[\\d{1,2}\\]\ttrain-auc:0\\.\\d+±0\\.\\d+\ttest-auc:0\\.\\d+±0\\.\\d+\\s*$", logs1))) + lapply(seq(1, 10), function(x) expect_true(grepl(paste0("^\\[", x), logs1[x]))) - bst_evaluation <- c('train-auc' = 0.9, 'test-auc' = 0.8) - bst_evaluation_err <- NULL + logs2 <- capture.output({ + model <- xgb.cv( + data = dtrain, + params = list( + objective = "binary:logistic", + eval_metric = "auc", + max_depth = 2, + nthread = n_threads + ), + nrounds = 10, + nfold = 3, + callbacks = list(xgb.cb.print.evaluation(period = 2, showsd = TRUE)) + ) + }) + expect_equal(length(logs2), 6) + expect_true(all(grepl("^\\[\\d{1,2}\\]\ttrain-auc:0\\.\\d+±0\\.\\d+\ttest-auc:0\\.\\d+±0\\.\\d+\\s*$", logs2))) + seq_matches <- c(seq(1, 10, 2), 10) + lapply(seq_along(seq_matches), function(x) expect_true(grepl(paste0("^\\[", seq_matches[x]), logs2[x]))) +}) - evaluation_log <- list() - f <- cb.evaluation.log() +test_that("xgb.cb.evaluation.log works as expected for xgb.train", { + model <- xgb.train( + data = dtrain, + params = list( + objective = "binary:logistic", + eval_metric = "auc", + max_depth = 2, + nthread = n_threads + ), + nrounds = 10, + verbose = FALSE, + evals = list(train = dtrain, test = dtest), + callbacks = list(xgb.cb.evaluation.log()) + ) + logs <- attributes(model)$evaluation_log - expect_false(is.null(attr(f, 'call'))) - expect_equal(attr(f, 'name'), 'cb.evaluation.log') + expect_equal(nrow(logs), 10) + expect_equal(colnames(logs), c("iter", "train_auc", "test_auc")) +}) - iteration <- 1 - expect_silent(f()) - expect_equal(evaluation_log, - list(c(iter = 1, bst_evaluation))) - iteration <- 2 - expect_silent(f()) - expect_equal(evaluation_log, - list(c(iter = 1, bst_evaluation), c(iter = 2, bst_evaluation))) - expect_silent(f(finalize = TRUE)) - expect_equal(evaluation_log, - data.table::data.table(iter = 1:2, train_auc = c(0.9, 0.9), test_auc = c(0.8, 0.8))) +test_that("xgb.cb.evaluation.log works as expected for xgb.cv", { + model <- xgb.cv( + data = dtrain, + params = list( + objective = "binary:logistic", + eval_metric = "auc", + max_depth = 2, + nthread = n_threads + ), + nrounds = 10, + verbose = FALSE, + nfold = 3, + callbacks = list(xgb.cb.evaluation.log()) + ) + logs <- model$evaluation_log - bst_evaluation_err <- c('train-auc' = 0.1, 'test-auc' = 0.2) - evaluation_log <- list() - f <- cb.evaluation.log() - - iteration <- 1 - expect_silent(f()) - expect_equal(evaluation_log, - list(c(iter = 1, c(bst_evaluation, bst_evaluation_err)))) - iteration <- 2 - expect_silent(f()) - expect_equal(evaluation_log, - list(c(iter = 1, c(bst_evaluation, bst_evaluation_err)), - c(iter = 2, c(bst_evaluation, bst_evaluation_err)))) - expect_silent(f(finalize = TRUE)) - expect_equal(evaluation_log, - data.table::data.table(iter = 1:2, - train_auc_mean = c(0.9, 0.9), train_auc_std = c(0.1, 0.1), - 
test_auc_mean = c(0.8, 0.8), test_auc_std = c(0.2, 0.2))) + expect_equal(nrow(logs), 10) + expect_equal( + colnames(logs), + c("iter", "train_auc_mean", "train_auc_std", "test_auc_mean", "test_auc_std") + ) }) @@ -109,26 +155,26 @@ param <- list(objective = "binary:logistic", eval_metric = "error", test_that("can store evaluation_log without printing", { expect_silent( - bst <- xgb.train(param, dtrain, nrounds = 10, watchlist, eta = 1, verbose = 0) + bst <- xgb.train(param, dtrain, nrounds = 10, evals = evals, eta = 1, verbose = 0) ) expect_false(is.null(attributes(bst)$evaluation_log)) expect_false(is.null(attributes(bst)$evaluation_log$train_error)) expect_lt(attributes(bst)$evaluation_log[, min(train_error)], 0.2) }) -test_that("cb.reset.parameters works as expected", { +test_that("xgb.cb.reset.parameters works as expected", { # fixed eta set.seed(111) - bst0 <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 0.9, verbose = 0) + bst0 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, eta = 0.9, verbose = 0) expect_false(is.null(attributes(bst0)$evaluation_log)) expect_false(is.null(attributes(bst0)$evaluation_log$train_error)) # same eta but re-set as a vector parameter in the callback set.seed(111) my_par <- list(eta = c(0.9, 0.9)) - bst1 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, - callbacks = list(cb.reset.parameters(my_par))) + bst1 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0, + callbacks = list(xgb.cb.reset.parameters(my_par))) expect_false(is.null(attributes(bst1)$evaluation_log$train_error)) expect_equal(attributes(bst0)$evaluation_log$train_error, attributes(bst1)$evaluation_log$train_error) @@ -136,8 +182,8 @@ test_that("cb.reset.parameters works as expected", { # same eta but re-set via a function in the callback set.seed(111) my_par <- list(eta = function(itr, itr_end) 0.9) - bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, - callbacks = list(cb.reset.parameters(my_par))) + bst2 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0, + callbacks = list(xgb.cb.reset.parameters(my_par))) expect_false(is.null(attributes(bst2)$evaluation_log$train_error)) expect_equal(attributes(bst0)$evaluation_log$train_error, attributes(bst2)$evaluation_log$train_error) @@ -145,39 +191,39 @@ test_that("cb.reset.parameters works as expected", { # different eta re-set as a vector parameter in the callback set.seed(111) my_par <- list(eta = c(0.6, 0.5)) - bst3 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, - callbacks = list(cb.reset.parameters(my_par))) + bst3 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0, + callbacks = list(xgb.cb.reset.parameters(my_par))) expect_false(is.null(attributes(bst3)$evaluation_log$train_error)) expect_false(all(attributes(bst0)$evaluation_log$train_error == attributes(bst3)$evaluation_log$train_error)) # resetting multiple parameters at the same time runs with no error my_par <- list(eta = c(1., 0.5), gamma = c(1, 2), max_depth = c(4, 8)) expect_error( - bst4 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, - callbacks = list(cb.reset.parameters(my_par))) + bst4 <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0, + callbacks = list(xgb.cb.reset.parameters(my_par))) , NA) # NA = no error # CV works as well expect_error( bst4 <- xgb.cv(param, dtrain, nfold = 2, nrounds = 2, verbose = 0, - callbacks = list(cb.reset.parameters(my_par))) + callbacks = list(xgb.cb.reset.parameters(my_par))) , NA) # NA = no 
error # expect no learning with 0 learning rate my_par <- list(eta = c(0., 0.)) - bstX <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, - callbacks = list(cb.reset.parameters(my_par))) + bstX <- xgb.train(param, dtrain, nrounds = 2, evals = evals, verbose = 0, + callbacks = list(xgb.cb.reset.parameters(my_par))) expect_false(is.null(attributes(bstX)$evaluation_log$train_error)) er <- unique(attributes(bstX)$evaluation_log$train_error) expect_length(er, 1) expect_gt(er, 0.4) }) -test_that("cb.save.model works as expected", { +test_that("xgb.cb.save.model works as expected", { files <- c('xgboost_01.json', 'xgboost_02.json', 'xgboost.json') files <- unname(sapply(files, function(f) file.path(tempdir(), f))) for (f in files) if (file.exists(f)) file.remove(f) - bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, verbose = 0, + bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, eta = 1, verbose = 0, save_period = 1, save_name = file.path(tempdir(), "xgboost_%02d.json")) expect_true(file.exists(files[1])) expect_true(file.exists(files[2])) @@ -193,7 +239,7 @@ test_that("cb.save.model works as expected", { expect_equal(xgb.save.raw(bst), xgb.save.raw(b2)) # save_period = 0 saves the last iteration's model - bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, verbose = 0, + bst <- xgb.train(param, dtrain, nrounds = 2, evals = evals, eta = 1, verbose = 0, save_period = 0, save_name = file.path(tempdir(), 'xgboost.json')) expect_true(file.exists(files[3])) b2 <- xgb.load(files[3]) @@ -206,7 +252,7 @@ test_that("cb.save.model works as expected", { test_that("early stopping xgb.train works", { set.seed(11) expect_output( - bst <- xgb.train(param, dtrain, nrounds = 20, watchlist, eta = 0.3, + bst <- xgb.train(param, dtrain, nrounds = 20, evals = evals, eta = 0.3, early_stopping_rounds = 3, maximize = FALSE) , "Stopping. Best iteration") expect_false(is.null(xgb.attr(bst, "best_iteration"))) @@ -220,7 +266,7 @@ test_that("early stopping xgb.train works", { set.seed(11) expect_silent( - bst0 <- xgb.train(param, dtrain, nrounds = 20, watchlist, eta = 0.3, + bst0 <- xgb.train(param, dtrain, nrounds = 20, evals = evals, eta = 0.3, early_stopping_rounds = 3, maximize = FALSE, verbose = 0) ) expect_equal(attributes(bst)$evaluation_log, attributes(bst0)$evaluation_log) @@ -236,10 +282,10 @@ test_that("early stopping xgb.train works", { test_that("early stopping using a specific metric works", { set.seed(11) expect_output( - bst <- xgb.train(param[-2], dtrain, nrounds = 20, watchlist, eta = 0.6, + bst <- xgb.train(param[-2], dtrain, nrounds = 20, evals = evals, eta = 0.6, eval_metric = "logloss", eval_metric = "auc", - callbacks = list(cb.early.stop(stopping_rounds = 3, maximize = FALSE, - metric_name = 'test_logloss'))) + callbacks = list(xgb.cb.early.stop(stopping_rounds = 3, maximize = FALSE, + metric_name = 'test_logloss'))) , "Stopping. Best iteration") expect_false(is.null(xgb.attr(bst, "best_iteration"))) expect_lt(xgb.attr(bst, "best_iteration"), 19) @@ -269,7 +315,7 @@ test_that("early stopping works with titanic", { nrounds = 100, early_stopping_rounds = 3, nthread = n_threads, - watchlist = list(train = xgb.DMatrix(dtx, label = dty)) + evals = list(train = xgb.DMatrix(dtx, label = dty)) ) expect_true(TRUE) # should not crash @@ -281,10 +327,10 @@ test_that("early stopping xgb.cv works", { cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.3, nrounds = 20, early_stopping_rounds = 3, maximize = FALSE) , "Stopping. 
Best iteration") - expect_false(is.null(cv$best_iteration)) - expect_lt(cv$best_iteration, 19) + expect_false(is.null(cv$early_stop$best_iteration)) + expect_lt(cv$early_stop$best_iteration, 19) # the best error is min error: - expect_true(cv$evaluation_log[, test_error_mean[cv$best_iteration] == min(test_error_mean)]) + expect_true(cv$evaluation_log[, test_error_mean[cv$early_stop$best_iteration] == min(test_error_mean)]) }) test_that("prediction in xgb.cv works", { @@ -292,19 +338,19 @@ test_that("prediction in xgb.cv works", { nrounds <- 4 cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.5, nrounds = nrounds, prediction = TRUE, verbose = 0) expect_false(is.null(cv$evaluation_log)) - expect_false(is.null(cv$pred)) - expect_length(cv$pred, nrow(train$data)) - err_pred <- mean(sapply(cv$folds, function(f) mean(err(ltrain[f], cv$pred[f])))) + expect_false(is.null(cv$cv_predict$pred)) + expect_length(cv$cv_predict$pred, nrow(train$data)) + err_pred <- mean(sapply(cv$folds, function(f) mean(err(ltrain[f], cv$cv_predict$pred[f])))) err_log <- cv$evaluation_log[nrounds, test_error_mean] expect_equal(err_pred, err_log, tolerance = 1e-6) # save CV models set.seed(11) cvx <- xgb.cv(param, dtrain, nfold = 5, eta = 0.5, nrounds = nrounds, prediction = TRUE, verbose = 0, - callbacks = list(cb.cv.predict(save_models = TRUE))) + callbacks = list(xgb.cb.cv.predict(save_models = TRUE))) expect_equal(cv$evaluation_log, cvx$evaluation_log) - expect_length(cvx$models, 5) - expect_true(all(sapply(cvx$models, class) == 'xgb.Booster')) + expect_length(cvx$cv_predict$models, 5) + expect_true(all(sapply(cvx$cv_predict$models, class) == 'xgb.Booster')) }) test_that("prediction in xgb.cv works for gblinear too", { @@ -312,8 +358,8 @@ test_that("prediction in xgb.cv works for gblinear too", { p <- list(booster = 'gblinear', objective = "reg:logistic", nthread = n_threads) cv <- xgb.cv(p, dtrain, nfold = 5, eta = 0.5, nrounds = 2, prediction = TRUE, verbose = 0) expect_false(is.null(cv$evaluation_log)) - expect_false(is.null(cv$pred)) - expect_length(cv$pred, nrow(train$data)) + expect_false(is.null(cv$cv_predict$pred)) + expect_length(cv$cv_predict$pred, nrow(train$data)) }) test_that("prediction in early-stopping xgb.cv works", { @@ -321,17 +367,17 @@ test_that("prediction in early-stopping xgb.cv works", { expect_output( cv <- xgb.cv(param, dtrain, nfold = 5, eta = 0.1, nrounds = 20, early_stopping_rounds = 5, maximize = FALSE, stratified = FALSE, - prediction = TRUE, base_score = 0.5) + prediction = TRUE, base_score = 0.5, verbose = TRUE) , "Stopping. 
Best iteration") - expect_false(is.null(cv$best_iteration)) - expect_lt(cv$best_iteration, 19) + expect_false(is.null(cv$early_stop$best_iteration)) + expect_lt(cv$early_stop$best_iteration, 19) expect_false(is.null(cv$evaluation_log)) - expect_false(is.null(cv$pred)) - expect_length(cv$pred, nrow(train$data)) + expect_false(is.null(cv$cv_predict$pred)) + expect_length(cv$cv_predict$pred, nrow(train$data)) - err_pred <- mean(sapply(cv$folds, function(f) mean(err(ltrain[f], cv$pred[f])))) - err_log <- cv$evaluation_log[cv$best_iteration, test_error_mean] + err_pred <- mean(sapply(cv$folds, function(f) mean(err(ltrain[f], cv$cv_predict$pred[f])))) + err_log <- cv$evaluation_log[cv$early_stop$best_iteration, test_error_mean] expect_equal(err_pred, err_log, tolerance = 1e-6) err_log_last <- cv$evaluation_log[cv$niter, test_error_mean] expect_gt(abs(err_pred - err_log_last), 1e-4) @@ -341,14 +387,14 @@ test_that("prediction in xgb.cv for softprob works", { lb <- as.numeric(iris$Species) - 1 set.seed(11) expect_warning( - cv <- xgb.cv(data = as.matrix(iris[, -5]), label = lb, nfold = 4, + cv <- xgb.cv(data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb), nfold = 4, eta = 0.5, nrounds = 5, max_depth = 3, nthread = n_threads, subsample = 0.8, gamma = 2, verbose = 0, prediction = TRUE, objective = "multi:softprob", num_class = 3) , NA) - expect_false(is.null(cv$pred)) - expect_equal(dim(cv$pred), c(nrow(iris), 3)) - expect_lt(diff(range(rowSums(cv$pred))), 1e-6) + expect_false(is.null(cv$cv_predict$pred)) + expect_equal(dim(cv$cv_predict$pred), c(nrow(iris), 3)) + expect_lt(diff(range(rowSums(cv$cv_predict$pred))), 1e-6) }) test_that("prediction in xgb.cv works for multi-quantile", { @@ -368,7 +414,7 @@ test_that("prediction in xgb.cv works for multi-quantile", { prediction = TRUE, verbose = 0 ) - expect_equal(dim(cv$pred), c(nrow(x), 5)) + expect_equal(dim(cv$cv_predict$pred), c(nrow(x), 5)) }) test_that("prediction in xgb.cv works for multi-output", { @@ -389,5 +435,46 @@ test_that("prediction in xgb.cv works for multi-output", { prediction = TRUE, verbose = 0 ) - expect_equal(dim(cv$pred), c(nrow(x), 2)) + expect_equal(dim(cv$cv_predict$pred), c(nrow(x), 2)) +}) + +test_that("prediction in xgb.cv works for multi-quantile", { + data(mtcars) + y <- mtcars$mpg + x <- as.matrix(mtcars[, -1]) + dm <- xgb.DMatrix(x, label = y, nthread = 1) + cv <- xgb.cv( + data = dm, + params = list( + objective = "reg:quantileerror", + quantile_alpha = c(0.1, 0.2, 0.5, 0.8, 0.9), + nthread = 1 + ), + nrounds = 5, + nfold = 3, + prediction = TRUE, + verbose = 0 + ) + expect_equal(dim(cv$cv_predict$pred), c(nrow(x), 5)) +}) + +test_that("prediction in xgb.cv works for multi-output", { + data(mtcars) + y <- mtcars$mpg + x <- as.matrix(mtcars[, -1]) + dm <- xgb.DMatrix(x, label = cbind(y, -y), nthread = 1) + cv <- xgb.cv( + data = dm, + params = list( + tree_method = "hist", + multi_strategy = "multi_output_tree", + objective = "reg:squarederror", + nthread = n_threads + ), + nrounds = 5, + nfold = 3, + prediction = TRUE, + verbose = 0 + ) + expect_equal(dim(cv$cv_predict$pred), c(nrow(x), 2)) }) diff --git a/R-package/tests/testthat/test_custom_objective.R b/R-package/tests/testthat/test_custom_objective.R index c65031246..d3050b152 100644 --- a/R-package/tests/testthat/test_custom_objective.R +++ b/R-package/tests/testthat/test_custom_objective.R @@ -12,7 +12,7 @@ dtrain <- xgb.DMatrix( dtest <- xgb.DMatrix( agaricus.test$data, label = agaricus.test$label, nthread = n_threads ) -watchlist <- list(eval = dtest, train 
= dtrain) +evals <- list(eval = dtest, train = dtrain) logregobj <- function(preds, dtrain) { labels <- getinfo(dtrain, "label") @@ -33,7 +33,7 @@ param <- list(max_depth = 2, eta = 1, nthread = n_threads, num_round <- 2 test_that("custom objective works", { - bst <- xgb.train(param, dtrain, num_round, watchlist) + bst <- xgb.train(param, dtrain, num_round, evals) expect_equal(class(bst), "xgb.Booster") expect_false(is.null(attributes(bst)$evaluation_log)) expect_false(is.null(attributes(bst)$evaluation_log$eval_error)) @@ -48,7 +48,7 @@ test_that("custom objective in CV works", { }) test_that("custom objective with early stop works", { - bst <- xgb.train(param, dtrain, 10, watchlist) + bst <- xgb.train(param, dtrain, 10, evals) expect_equal(class(bst), "xgb.Booster") train_log <- attributes(bst)$evaluation_log$train_error expect_true(all(diff(train_log) <= 0)) @@ -66,7 +66,7 @@ test_that("custom objective using DMatrix attr works", { return(list(grad = grad, hess = hess)) } param$objective <- logregobjattr - bst <- xgb.train(param, dtrain, num_round, watchlist) + bst <- xgb.train(param, dtrain, num_round, evals) expect_equal(class(bst), "xgb.Booster") }) diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R index 50621f241..548afece3 100644 --- a/R-package/tests/testthat/test_dmatrix.R +++ b/R-package/tests/testthat/test_dmatrix.R @@ -41,13 +41,13 @@ test_that("xgb.DMatrix: basic construction", { params <- list(tree_method = "hist", nthread = n_threads) bst_fd <- xgb.train( - params, nrounds = 8, fd, watchlist = list(train = fd) + params, nrounds = 8, fd, evals = list(train = fd) ) bst_dgr <- xgb.train( - params, nrounds = 8, fdgr, watchlist = list(train = fdgr) + params, nrounds = 8, fdgr, evals = list(train = fdgr) ) bst_dgc <- xgb.train( - params, nrounds = 8, fdgc, watchlist = list(train = fdgc) + params, nrounds = 8, fdgc, evals = list(train = fdgc) ) raw_fd <- xgb.save.raw(bst_fd, raw_format = "ubj") @@ -243,7 +243,7 @@ test_that("xgb.DMatrix: print", { txt <- capture.output({ print(dtrain) }) - expect_equal(txt, "xgb.DMatrix dim: 6513 x 126 info: label weight base_margin colnames: yes") + expect_equal(txt, "xgb.DMatrix dim: 6513 x 126 info: base_margin, label, weight colnames: yes") # DMatrix with just features dtrain <- xgb.DMatrix( @@ -302,6 +302,37 @@ test_that("xgb.DMatrix: Inf as missing", { file.remove(fname_nan) }) +test_that("xgb.DMatrix: missing in CSR", { + x_dense <- matrix(as.numeric(1:10), nrow = 5) + x_dense[2, 1] <- NA_real_ + + x_csr <- as(x_dense, "RsparseMatrix") + + m_dense <- xgb.DMatrix(x_dense, nthread = n_threads, missing = NA_real_) + xgb.DMatrix.save(m_dense, "dense.dmatrix") + + m_csr <- xgb.DMatrix(x_csr, nthread = n_threads, missing = NA) + xgb.DMatrix.save(m_csr, "csr.dmatrix") + + denseconn <- file("dense.dmatrix", "rb") + csrconn <- file("csr.dmatrix", "rb") + + expect_equal(file.size("dense.dmatrix"), file.size("csr.dmatrix")) + + bytes <- file.size("dense.dmatrix") + densedmatrix <- readBin(denseconn, "raw", n = bytes) + csrmatrix <- readBin(csrconn, "raw", n = bytes) + + expect_equal(length(densedmatrix), length(csrmatrix)) + expect_equal(densedmatrix, csrmatrix) + + close(denseconn) + close(csrconn) + + file.remove("dense.dmatrix") + file.remove("csr.dmatrix") +}) + test_that("xgb.DMatrix: error on three-dimensional array", { set.seed(123) x <- matrix(rnorm(500), nrow = 50) @@ -692,3 +723,58 @@ test_that("xgb.DMatrix: quantile cuts look correct", { } ) }) + +test_that("xgb.DMatrix: slicing keeps field 
indicators", { + data(mtcars) + x <- as.matrix(mtcars[, -1]) + y <- mtcars[, 1] + dm <- xgb.DMatrix( + data = x, + label_lower_bound = -y, + label_upper_bound = y, + nthread = 1 + ) + idx_take <- seq(1, 5) + dm_slice <- xgb.slice.DMatrix(dm, idx_take) + + expect_true(xgb.DMatrix.hasinfo(dm_slice, "label_lower_bound")) + expect_true(xgb.DMatrix.hasinfo(dm_slice, "label_upper_bound")) + expect_false(xgb.DMatrix.hasinfo(dm_slice, "label")) + + expect_equal(getinfo(dm_slice, "label_lower_bound"), -y[idx_take], tolerance = 1e-6) + expect_equal(getinfo(dm_slice, "label_upper_bound"), y[idx_take], tolerance = 1e-6) +}) + +test_that("xgb.DMatrix: can slice with groups", { + data(iris) + x <- as.matrix(iris[, -5]) + set.seed(123) + y <- sample(3, size = nrow(x), replace = TRUE) + group <- c(50, 50, 50) + dm <- xgb.DMatrix(x, label = y, group = group, nthread = 1) + idx_take <- seq(1, 50) + dm_slice <- xgb.slice.DMatrix(dm, idx_take, allow_groups = TRUE) + + expect_true(xgb.DMatrix.hasinfo(dm_slice, "label")) + expect_false(xgb.DMatrix.hasinfo(dm_slice, "group")) + expect_false(xgb.DMatrix.hasinfo(dm_slice, "qid")) + expect_null(getinfo(dm_slice, "group")) + expect_equal(getinfo(dm_slice, "label"), y[idx_take], tolerance = 1e-6) +}) + +test_that("xgb.DMatrix: can read CSV", { + txt <- paste( + "1,2,3", + "-1,3,2", + sep = "\n" + ) + fname <- file.path(tempdir(), "data.csv") + writeChar(txt, fname) + uri <- paste0(fname, "?format=csv&label_column=0") + dm <- xgb.DMatrix(uri, silent = TRUE) + expect_equal(getinfo(dm, "label"), c(1, -1)) + expect_equal( + as.matrix(xgb.get.DMatrix.data(dm)), + matrix(c(2, 3, 3, 2), nrow = 2, byrow = TRUE) + ) +}) diff --git a/R-package/tests/testthat/test_feature_weights.R b/R-package/tests/testthat/test_feature_weights.R index 4ed78c9b6..54fec67cf 100644 --- a/R-package/tests/testthat/test_feature_weights.R +++ b/R-package/tests/testthat/test_feature_weights.R @@ -25,7 +25,7 @@ test_that("training with feature weights works", { expect_lt(importance[1, Frequency], importance[9, Frequency]) } - for (tm in c("hist", "approx", "exact")) { + for (tm in c("hist", "approx")) { test(tm) } }) diff --git a/R-package/tests/testthat/test_glm.R b/R-package/tests/testthat/test_glm.R index 349bcce8d..b59de8b62 100644 --- a/R-package/tests/testthat/test_glm.R +++ b/R-package/tests/testthat/test_glm.R @@ -14,37 +14,37 @@ test_that("gblinear works", { param <- list(objective = "binary:logistic", eval_metric = "error", booster = "gblinear", nthread = n_threads, eta = 0.8, alpha = 0.0001, lambda = 0.0001) - watchlist <- list(eval = dtest, train = dtrain) + evals <- list(eval = dtest, train = dtrain) n <- 5 # iterations ERR_UL <- 0.005 # upper limit for the test set error VERB <- 0 # chatterbox switch param$updater <- 'shotgun' - bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'shuffle') + bst <- xgb.train(param, dtrain, n, evals, verbose = VERB, feature_selector = 'shuffle') ypred <- predict(bst, dtest) expect_equal(length(getinfo(dtest, 'label')), 1611) expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL) - bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'cyclic', - callbacks = list(cb.gblinear.history())) + bst <- xgb.train(param, dtrain, n, evals, verbose = VERB, feature_selector = 'cyclic', + callbacks = list(xgb.cb.gblinear.history())) expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL) h <- xgb.gblinear.history(bst) expect_equal(dim(h), c(n, ncol(dtrain) + 1)) expect_is(h, "matrix") 
param$updater <- 'coord_descent' - bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'cyclic') + bst <- xgb.train(param, dtrain, n, evals, verbose = VERB, feature_selector = 'cyclic') expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL) - bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'shuffle') + bst <- xgb.train(param, dtrain, n, evals, verbose = VERB, feature_selector = 'shuffle') expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL) - bst <- xgb.train(param, dtrain, 2, watchlist, verbose = VERB, feature_selector = 'greedy') + bst <- xgb.train(param, dtrain, 2, evals, verbose = VERB, feature_selector = 'greedy') expect_lt(attributes(bst)$evaluation_log$eval_error[2], ERR_UL) - bst <- xgb.train(param, dtrain, n, watchlist, verbose = VERB, feature_selector = 'thrifty', - top_k = 50, callbacks = list(cb.gblinear.history(sparse = TRUE))) + bst <- xgb.train(param, dtrain, n, evals, verbose = VERB, feature_selector = 'thrifty', + top_k = 50, callbacks = list(xgb.cb.gblinear.history(sparse = TRUE))) expect_lt(attributes(bst)$evaluation_log$eval_error[n], ERR_UL) h <- xgb.gblinear.history(bst) expect_equal(dim(h), c(n, ncol(dtrain) + 1)) diff --git a/R-package/tests/testthat/test_ranking.R b/R-package/tests/testthat/test_ranking.R index e49a32025..0e7db42da 100644 --- a/R-package/tests/testthat/test_ranking.R +++ b/R-package/tests/testthat/test_ranking.R @@ -15,7 +15,7 @@ test_that('Test ranking with unweighted data', { params <- list(eta = 1, tree_method = 'exact', objective = 'rank:pairwise', max_depth = 1, eval_metric = 'auc', eval_metric = 'aucpr', nthread = n_threads) - bst <- xgb.train(params, dtrain, nrounds = 10, watchlist = list(train = dtrain)) + bst <- xgb.train(params, dtrain, nrounds = 10, evals = list(train = dtrain)) # Check if the metric is monotone increasing expect_true(all(diff(attributes(bst)$evaluation_log$train_auc) >= 0)) expect_true(all(diff(attributes(bst)$evaluation_log$train_aucpr) >= 0)) @@ -39,7 +39,7 @@ test_that('Test ranking with weighted data', { eta = 1, tree_method = "exact", objective = "rank:pairwise", max_depth = 1, eval_metric = "auc", eval_metric = "aucpr", nthread = n_threads ) - bst <- xgb.train(params, dtrain, nrounds = 10, watchlist = list(train = dtrain)) + bst <- xgb.train(params, dtrain, nrounds = 10, evals = list(train = dtrain)) # Check if the metric is monotone increasing expect_true(all(diff(attributes(bst)$evaluation_log$train_auc) >= 0)) expect_true(all(diff(attributes(bst)$evaluation_log$train_aucpr) >= 0)) diff --git a/R-package/tests/testthat/test_update.R b/R-package/tests/testthat/test_update.R index 3c88178e0..7fdc6eb84 100644 --- a/R-package/tests/testthat/test_update.R +++ b/R-package/tests/testthat/test_update.R @@ -17,7 +17,7 @@ dtest <- xgb.DMatrix( win32_flag <- .Platform$OS.type == "windows" && .Machine$sizeof.pointer != 8 test_that("updating the model works", { - watchlist <- list(train = dtrain, test = dtest) + evals <- list(train = dtrain, test = dtest) # no-subsampling p1 <- list( @@ -25,19 +25,19 @@ test_that("updating the model works", { updater = "grow_colmaker,prune" ) set.seed(11) - bst1 <- xgb.train(p1, dtrain, nrounds = 10, watchlist, verbose = 0) + bst1 <- xgb.train(p1, dtrain, nrounds = 10, evals = evals, verbose = 0) tr1 <- xgb.model.dt.tree(model = bst1) # with subsampling p2 <- modifyList(p1, list(subsample = 0.1)) set.seed(11) - bst2 <- xgb.train(p2, dtrain, nrounds = 10, watchlist, verbose = 0) + bst2 <- xgb.train(p2, dtrain, 
nrounds = 10, evals = evals, verbose = 0) tr2 <- xgb.model.dt.tree(model = bst2) # the same no-subsampling boosting with an extra 'refresh' updater: p1r <- modifyList(p1, list(updater = 'grow_colmaker,prune,refresh', refresh_leaf = FALSE)) set.seed(11) - bst1r <- xgb.train(p1r, dtrain, nrounds = 10, watchlist, verbose = 0) + bst1r <- xgb.train(p1r, dtrain, nrounds = 10, evals = evals, verbose = 0) tr1r <- xgb.model.dt.tree(model = bst1r) # all should be the same when no subsampling expect_equal(attributes(bst1)$evaluation_log, attributes(bst1r)$evaluation_log) @@ -53,7 +53,7 @@ test_that("updating the model works", { # the same boosting with subsampling with an extra 'refresh' updater: p2r <- modifyList(p2, list(updater = 'grow_colmaker,prune,refresh', refresh_leaf = FALSE)) set.seed(11) - bst2r <- xgb.train(p2r, dtrain, nrounds = 10, watchlist, verbose = 0) + bst2r <- xgb.train(p2r, dtrain, nrounds = 10, evals = evals, verbose = 0) tr2r <- xgb.model.dt.tree(model = bst2r) # should be the same evaluation but different gains and larger cover expect_equal(attributes(bst2)$evaluation_log, attributes(bst2r)$evaluation_log) @@ -66,7 +66,7 @@ test_that("updating the model works", { # process type 'update' for no-subsampling model, refreshing the tree stats AND leaves from training data: set.seed(123) p1u <- modifyList(p1, list(process_type = 'update', updater = 'refresh', refresh_leaf = TRUE)) - bst1u <- xgb.train(p1u, dtrain, nrounds = 10, watchlist, verbose = 0, xgb_model = bst1) + bst1u <- xgb.train(p1u, dtrain, nrounds = 10, evals = evals, verbose = 0, xgb_model = bst1) tr1u <- xgb.model.dt.tree(model = bst1u) # all should be the same when no subsampling expect_equal(attributes(bst1)$evaluation_log, attributes(bst1u)$evaluation_log) @@ -79,7 +79,7 @@ test_that("updating the model works", { # same thing but with a serialized model set.seed(123) - bst1u <- xgb.train(p1u, dtrain, nrounds = 10, watchlist, verbose = 0, xgb_model = xgb.save.raw(bst1)) + bst1u <- xgb.train(p1u, dtrain, nrounds = 10, evals = evals, verbose = 0, xgb_model = xgb.save.raw(bst1)) tr1u <- xgb.model.dt.tree(model = bst1u) # all should be the same when no subsampling expect_equal(attributes(bst1)$evaluation_log, attributes(bst1u)$evaluation_log) @@ -87,7 +87,7 @@ test_that("updating the model works", { # process type 'update' for model with subsampling, refreshing only the tree stats from training data: p2u <- modifyList(p2, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE)) - bst2u <- xgb.train(p2u, dtrain, nrounds = 10, watchlist, verbose = 0, xgb_model = bst2) + bst2u <- xgb.train(p2u, dtrain, nrounds = 10, evals = evals, verbose = 0, xgb_model = bst2) tr2u <- xgb.model.dt.tree(model = bst2u) # should be the same evaluation but different gains and larger cover expect_equal(attributes(bst2)$evaluation_log, attributes(bst2u)$evaluation_log) @@ -102,7 +102,7 @@ test_that("updating the model works", { # process type 'update' for no-subsampling model, refreshing only the tree stats from TEST data: p1ut <- modifyList(p1, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE)) - bst1ut <- xgb.train(p1ut, dtest, nrounds = 10, watchlist, verbose = 0, xgb_model = bst1) + bst1ut <- xgb.train(p1ut, dtest, nrounds = 10, evals = evals, verbose = 0, xgb_model = bst1) tr1ut <- xgb.model.dt.tree(model = bst1ut) # should be the same evaluations but different gains and smaller cover (test data is smaller) expect_equal(attributes(bst1)$evaluation_log, attributes(bst1ut)$evaluation_log) @@ -115,18 
+115,18 @@ test_that("updating works for multiclass & multitree", { dtr <- xgb.DMatrix( as.matrix(iris[, -5]), label = as.numeric(iris$Species) - 1, nthread = n_threads ) - watchlist <- list(train = dtr) + evals <- list(train = dtr) p0 <- list(max_depth = 2, eta = 0.5, nthread = n_threads, subsample = 0.6, objective = "multi:softprob", num_class = 3, num_parallel_tree = 2, base_score = 0) set.seed(121) - bst0 <- xgb.train(p0, dtr, 5, watchlist, verbose = 0) + bst0 <- xgb.train(p0, dtr, 5, evals = evals, verbose = 0) tr0 <- xgb.model.dt.tree(model = bst0) # run update process for an original model with subsampling p0u <- modifyList(p0, list(process_type = 'update', updater = 'refresh', refresh_leaf = FALSE)) bst0u <- xgb.train(p0u, dtr, nrounds = xgb.get.num.boosted.rounds(bst0), - watchlist, xgb_model = bst0, verbose = 0) + evals = evals, xgb_model = bst0, verbose = 0) tr0u <- xgb.model.dt.tree(model = bst0u) # should be the same evaluation but different gains and larger cover diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd index efafc624d..fc49adc0f 100644 --- a/R-package/vignettes/xgboostPresentation.Rmd +++ b/R-package/vignettes/xgboostPresentation.Rmd @@ -341,10 +341,10 @@ One way to measure progress in learning of a model is to provide to **XGBoost** > in some way it is similar to what we have done above with the average error. The main difference is that below it was after building the model, and now it is during the construction that we measure errors. -For the purpose of this example, we use `watchlist` parameter. It is a list of `xgb.DMatrix`, each of them tagged with a name. +For the purpose of this example, we use the `evals` parameter. It is a list of `xgb.DMatrix` objects, each of them tagged with a name. -```{r watchlist, message=F, warning=F} -watchlist <- list(train = dtrain, test = dtest) +```{r evals, message=F, warning=F} +evals <- list(train = dtrain, test = dtest) bst <- xgb.train( data = dtrain @@ -355,7 +355,7 @@ bst <- xgb.train( , objective = "binary:logistic" ) , nrounds = 2 - , watchlist = watchlist + , evals = evals ) ``` @@ -367,7 +367,7 @@ If with your own dataset you have not such results, you should think about how y For a better understanding of the learning progression, you may want to have some specific metric or even use multiple evaluation metrics. 
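The cross-validation interface touched earlier in this patch follows the same idea of monitoring metrics during training, but its results are now grouped under nested fields (`early_stop`, `cv_predict`) rather than stored at the top level. A minimal sketch of the new access pattern, assuming a binary-classification DMatrix with early stopping enabled; data and parameter values here are illustrative only, not part of the patch:

```r
# Hedged sketch of the renamed xgb.cv result fields exercised by the updated tests.
# Assumes early stopping is enabled and prediction = TRUE; values are illustrative.
library(xgboost)
data(agaricus.train, package = "xgboost")
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label, nthread = 2)
cv <- xgb.cv(
  params = list(objective = "binary:logistic", max_depth = 2, eta = 1, nthread = 2),
  data = dtrain, nrounds = 20, nfold = 5,
  early_stopping_rounds = 3, prediction = TRUE, verbose = 0
)
cv$early_stop$best_iteration  # previously cv$best_iteration
head(cv$cv_predict$pred)      # previously cv$pred
```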
-```{r watchlist2, message=F, warning=F} +```{r evals2, message=F, warning=F} bst <- xgb.train( data = dtrain , max_depth = 2 @@ -379,7 +379,7 @@ bst <- xgb.train( , eval_metric = "logloss" ) , nrounds = 2 - , watchlist = watchlist + , evals = evals ) ``` @@ -401,7 +401,7 @@ bst <- xgb.train( , eval_metric = "logloss" ) , nrounds = 2 - , watchlist = watchlist + , evals = evals ) ``` @@ -430,7 +430,7 @@ bst <- xgb.train( , objective = "binary:logistic" ) , nrounds = 2 - , watchlist = watchlist + , evals = evals ) ``` @@ -496,6 +496,9 @@ An interesting test to see how identical our saved model is to the original one ```{r loadModel, message=F, warning=F} # load binary model to R +# Note that the number of threads for 'xgb.load' is taken from global config, +# can be modified like this: +RhpcBLASctl::omp_set_num_threads(1) bst2 <- xgb.load(fname) xgb.parameters(bst2) <- list(nthread = 2) pred2 <- predict(bst2, test$data) diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index f295d1446..fbc24a315 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -1,6 +1,5 @@ # Automatically set source group based on folder function(auto_source_group SOURCES) - foreach(FILE ${SOURCES}) get_filename_component(PARENT_DIR "${FILE}" PATH) diff --git a/demo/dask/cpu_training.py b/demo/dask/cpu_training.py index 2bee444f7..7117eddd9 100644 --- a/demo/dask/cpu_training.py +++ b/demo/dask/cpu_training.py @@ -40,7 +40,7 @@ def main(client): # you can pass output directly into `predict` too. prediction = dxgb.predict(client, bst, dtrain) print("Evaluation history:", history) - return prediction + print("Error:", da.sqrt((prediction - y) ** 2).mean().compute()) if __name__ == "__main__": diff --git a/doc/R-package/index.rst b/doc/R-package/index.rst index 8a27d0174..bf9c1f8d9 100644 --- a/doc/R-package/index.rst +++ b/doc/R-package/index.rst @@ -34,4 +34,5 @@ Other topics .. toctree:: :maxdepth: 2 :titlesonly: + Handling of indexable elements diff --git a/doc/contrib/unit_tests.rst b/doc/contrib/unit_tests.rst index 662a632e2..908e5ed99 100644 --- a/doc/contrib/unit_tests.rst +++ b/doc/contrib/unit_tests.rst @@ -144,6 +144,14 @@ which provides higher flexibility. For example: ctest --verbose +If you need to debug errors on Windows using the debugger from VS, you can append the gtest flags in `test_main.cc`: + +.. code-block:: + + ::testing::GTEST_FLAG(filter) = "Suite.Test"; + ::testing::GTEST_FLAG(repeat) = 10; + + *********************************************** Sanitizers: Detect memory errors and data races *********************************************** diff --git a/doc/index.rst b/doc/index.rst index a2ae9bbd3..7b241c0a1 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -28,7 +28,7 @@ Contents Python Package R Package JVM Package - Ruby Package + Ruby Package Swift Package Julia Package C Package diff --git a/doc/parameter.rst b/doc/parameter.rst index a7d8203b0..00f0eaea6 100644 --- a/doc/parameter.rst +++ b/doc/parameter.rst @@ -118,7 +118,7 @@ Parameters for Tree Booster - All ``colsample_by*`` parameters have a range of (0, 1], the default value of 1, and specify the fraction of columns to be subsampled. - ``colsample_bytree`` is the subsample ratio of columns when constructing each tree. Subsampling occurs once for every tree constructed. - ``colsample_bylevel`` is the subsample ratio of columns for each level. Subsampling occurs once for every new depth level reached in a tree. Columns are subsampled from the set of columns chosen for the current tree. 
- - ``colsample_bynode`` is the subsample ratio of columns for each node (split). Subsampling occurs once every time a new split is evaluated. Columns are subsampled from the set of columns chosen for the current level. + - ``colsample_bynode`` is the subsample ratio of columns for each node (split). Subsampling occurs once every time a new split is evaluated. Columns are subsampled from the set of columns chosen for the current level. This is not supported by the exact tree method. - ``colsample_by*`` parameters work cumulatively. For instance, the combination ``{'colsample_bytree':0.5, 'colsample_bylevel':0.5, 'colsample_bynode':0.5}`` with 64 features will leave 8 features to choose from at @@ -450,7 +450,7 @@ Specify the learning task and the corresponding learning objective. The objectiv * ``seed`` [default=0] - - Random number seed. This parameter is ignored in R package, use `set.seed()` instead. + - Random number seed. In the R package, if not specified, the seed is drawn from R's own RNG engine instead of defaulting to zero. * ``seed_per_iteration`` [default= ``false``] @@ -489,7 +489,7 @@ Parameters for learning to rank (``rank:ndcg``, ``rank:map``, ``rank:pairwise``) These are parameters specific to learning to rank task. See :doc:`Learning to Rank </tutorials/learning_to_rank>` for an in-depth explanation. -* ``lambdarank_pair_method`` [default = ``mean``] +* ``lambdarank_pair_method`` [default = ``topk``] How to construct pairs for pair-wise learning. @@ -500,7 +500,13 @@ These are parameters specific to learning to rank task. See :doc:`Learning to Ra It specifies the number of pairs sampled for each document when pair method is ``mean``, or the truncation level for queries when the pair method is ``topk``. For example, to train with ``ndcg@6``, set ``lambdarank_num_pair_per_sample`` to :math:`6` and ``lambdarank_pair_method`` to ``topk``. -* ``lambdarank_unbiased`` [default = ``false``] +* ``lambdarank_normalization`` [default = ``true``] + + .. versionadded:: 2.1.0 + + Whether to normalize the leaf value by the lambda gradient. This normalization can sometimes stall the training progress. + +* ``lambdarank_unbiased`` [default = ``false``] Specify whether do we need to debias input click data. diff --git a/doc/python/callbacks.rst b/doc/python/callbacks.rst index 7cb257a81..6d8b43a11 100644 --- a/doc/python/callbacks.rst +++ b/doc/python/callbacks.rst @@ -36,7 +36,7 @@ inside iteration loop. You can also pass this callback function directly into X # Specify which dataset and which metric should be used for early stopping. early_stop = xgb.callback.EarlyStopping(rounds=early_stopping_rounds, metric_name='CustomErr', - data_name='Train') + data_name='Valid') booster = xgb.train( {'objective': 'binary:logistic', diff --git a/doc/python/python_intro.rst b/doc/python/python_intro.rst index 0d26a5253..cfdd20da0 100644 --- a/doc/python/python_intro.rst +++ b/doc/python/python_intro.rst @@ -63,7 +63,7 @@ The input data is stored in a :py:class:`DMatrix <xgboost.DMatrix>` object. For .. code-block:: python - dtrain = xgb.DMatrix('train.svm.txt') + dtrain = xgb.DMatrix('train.svm.txt?format=libsvm') dtrain.save_binary('train.buffer') * Missing values can be replaced by a default value in the :py:class:`DMatrix <xgboost.DMatrix>` constructor: @@ -86,7 +86,7 @@ to number of groups. .. code-block:: python - dtrain = xgb.DMatrix('train.svm.txt') + dtrain = xgb.DMatrix('train.svm.txt?format=libsvm') dtest = xgb.DMatrix('test.svm.buffer') The parser in XGBoost has limited functionality. 
When using Python interface, it's @@ -176,7 +176,6 @@ Support Matrix +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ | pyarrow.Table | NPA | NPA | NPA | NPA | NPA | NPA | +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ -+-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ | _\_array\_\_ | NPA | F | NPA | NPA | H | | +-------------------------+-----------+-------------------+-----------+-----------+--------------------+-------------+ | Others | SciCSR | F | | F | F | | @@ -240,7 +239,7 @@ A saved model can be loaded as follows: .. code-block:: python bst = xgb.Booster({'nthread': 4}) # init model - bst.load_model('model.bin') # load data + bst.load_model('model.bin') # load model data Methods including `update` and `boost` from `xgboost.Booster` are designed for internal usage only. The wrapper function `xgboost.train` does some diff --git a/doc/python/sklearn_estimator.rst b/doc/python/sklearn_estimator.rst index 207b9fa30..1aaa340b1 100644 --- a/doc/python/sklearn_estimator.rst +++ b/doc/python/sklearn_estimator.rst @@ -62,7 +62,7 @@ stack of trees: .. code-block:: python early_stop = xgb.callback.EarlyStopping( - rounds=2, metric_name='logloss', data_name='Validation_0', save_best=True + rounds=2, metric_name='logloss', data_name='validation_0', save_best=True ) clf = xgb.XGBClassifier(tree_method="hist", callbacks=[early_stop]) clf.fit(X_train, y_train, eval_set=[(X_test, y_test)]) diff --git a/doc/requirements.txt b/doc/requirements.txt index 667ef268f..ddff9be92 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -7,7 +7,9 @@ sh matplotlib graphviz numpy +scipy myst-parser +ray[train] xgboost_ray sphinx-gallery pyspark diff --git a/doc/tutorials/learning_to_rank.rst b/doc/tutorials/learning_to_rank.rst index 015f736e0..15a611bd0 100644 --- a/doc/tutorials/learning_to_rank.rst +++ b/doc/tutorials/learning_to_rank.rst @@ -48,11 +48,11 @@ Notice that the samples are sorted based on their query index in a non-decreasin import xgboost as xgb # Make a synthetic ranking dataset for demonstration - seed = 1994 + seed = 1994 X, y = make_classification(random_state=seed) rng = np.random.default_rng(seed) n_query_groups = 3 - qid = rng.integers(0, 3, size=X.shape[0]) + qid = rng.integers(0, n_query_groups, size=X.shape[0]) # Sort the inputs based on query index sorted_idx = np.argsort(qid) @@ -65,14 +65,14 @@ The simplest way to train a ranking model is by using the scikit-learn estimator .. code-block:: python ranker = xgb.XGBRanker(tree_method="hist", lambdarank_num_pair_per_sample=8, objective="rank:ndcg", lambdarank_pair_method="topk") - ranker.fit(X, y, qid=qid) + ranker.fit(X, y, qid=qid[sorted_idx]) Please note that, as of writing, there's no learning-to-rank interface in scikit-learn. As a result, the :py:class:`xgboost.XGBRanker` class does not fully conform the scikit-learn estimator guideline and can not be directly used with some of its utility functions. For instances, the ``auc_score`` and ``ndcg_score`` in scikit-learn don't consider query group information nor the pairwise loss. Most of the metrics are implemented as part of XGBoost, but to use scikit-learn utilities like :py:func:`sklearn.model_selection.cross_validation`, we need to make some adjustments in order to pass the ``qid`` as an additional parameter for :py:meth:`xgboost.XGBRanker.score`. 
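For reference, the R-package ranking tests updated in this patch (`test_ranking.R`) exercise the corresponding flow through `xgb.train` with query groups and the renamed `evals` argument. A minimal R sketch of that pattern with synthetic grouped data; names and values are illustrative only, not part of the patch:

```r
# Hedged sketch mirroring the updated R ranking test: grouped data, rank:pairwise
# objective, and per-iteration metrics via the renamed `evals` argument.
library(xgboost)
set.seed(1994)
x <- matrix(rnorm(300 * 4), ncol = 4)
y <- sample(0:1, 300, replace = TRUE)
group <- c(100, 100, 100)  # query group sizes, in row order
dtrain <- xgb.DMatrix(x, label = y, group = group, nthread = 1)
params <- list(
  objective = "rank:pairwise", eta = 1, max_depth = 1,
  eval_metric = "auc", eval_metric = "aucpr", nthread = 1
)
bst <- xgb.train(params, dtrain, nrounds = 10, evals = list(train = dtrain))
attributes(bst)$evaluation_log$train_auc  # per-iteration metric, as asserted in the test
```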
Given a data frame ``X`` (either pandas or cuDF), add the column ``qid`` as follows: .. code-block:: python df = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])]) - df["qid"] = qid + df["qid"] = qid[sorted_idx] ranker.fit(df, y) # No need to pass qid as a separate argument from sklearn.model_selection import StratifiedGroupKFold, cross_val_score @@ -146,7 +146,8 @@ The consideration of effective pairs also applies to the choice of pair method ( When using the mean strategy for generating pairs, where the target metric (like ``NDCG``) is computed over the whole query list, users can specify how many pairs should be generated per each document, by setting the ``lambdarank_num_pair_per_sample``. XGBoost will randomly sample ``lambdarank_num_pair_per_sample`` pairs for each element in the query group (:math:`|pairs| = |query| \times num\_pairsample`). Often, setting it to 1 can produce reasonable results. In cases where performance is inadequate due to insufficient number of effective pairs being generated, set ``lambdarank_num_pair_per_sample`` to a higher value. As more document pairs are generated, more effective pairs will be generated as well. -On the other hand, if you are prioritizing the top :math:`k` documents, the ``lambdarank_num_pair_per_sample`` should be set slightly higher than :math:`k` (with a few more documents) to obtain a good training result. +On the other hand, if you are prioritizing the top :math:`k` documents, the ``lambdarank_num_pair_per_sample`` should be set slightly higher than :math:`k` (with a few more documents) to obtain a good training result. Lastly, XGBoost employs additional regularization for learning to rank objectives, which can be disabled by setting the ``lambdarank_normalization`` to ``False``. + **Summary** If you have large amount of training data: diff --git a/doc/tutorials/spark_estimator.rst b/doc/tutorials/spark_estimator.rst index 8bd1dcd97..4e608440a 100644 --- a/doc/tutorials/spark_estimator.rst +++ b/doc/tutorials/spark_estimator.rst @@ -28,7 +28,7 @@ We can create a ``SparkXGBRegressor`` estimator like: .. code-block:: python from xgboost.spark import SparkXGBRegressor - spark_reg_estimator = SparkXGBRegressor( + xgb_regressor = SparkXGBRegressor( features_col="features", label_col="label", num_workers=2, @@ -61,7 +61,7 @@ type or spark array type. .. code-block:: python - transformed_test_spark_dataframe = xgb_regressor.predict(test_spark_dataframe) + transformed_test_spark_dataframe = xgb_regressor_model.transform(test_spark_dataframe) The above snippet code returns a ``transformed_test_spark_dataframe`` that contains the input diff --git a/doc/xgboost_doc.yml b/doc/xgboost_doc.yml index 90b877e73..177e8758f 100644 --- a/doc/xgboost_doc.yml +++ b/doc/xgboost_doc.yml @@ -1,15 +1,23 @@ name: xgboost_docs dependencies: - - python + - python=3.10 - pip - pygraphviz - sphinx + - sphinx-gallery - recommonmark - mock - sh - matplotlib + - numpy + - scipy + - scikit-learn + - myst-parser + - pyspark - pip: - breathe - sphinx_rtd_theme - pydot-ng - graphviz + - ray[train] + - xgboost_ray diff --git a/include/xgboost/base.h b/include/xgboost/base.h index 1c4b6568e..d6379d0d0 100644 --- a/include/xgboost/base.h +++ b/include/xgboost/base.h @@ -1,20 +1,18 @@ /** - * Copyright 2015-2023 by XGBoost Contributors + * Copyright 2015-2024, XGBoost Contributors * \file base.h * \brief Defines configuration macros and basic types for xgboost. 
 */ #ifndef XGBOOST_BASE_H_ #define XGBOOST_BASE_H_ -#include -#include +#include <dmlc/omp.h> // for omp_uint, omp_ulong -#include -#include -#include -#include -#include -#include +#include <cstdint> // for int32_t, uint64_t, int16_t +#include <ostream> // for ostream +#include <string> // for string +#include <utility> // for pair +#include <vector> // for vector /*! * \brief string flag for R library, to leave hooks when needed. @@ -37,7 +35,7 @@ * \brief Whether to customize global PRNG. */ #ifndef XGBOOST_CUSTOMIZE_GLOBAL_PRNG -#define XGBOOST_CUSTOMIZE_GLOBAL_PRNG XGBOOST_STRICT_R_MODE +#define XGBOOST_CUSTOMIZE_GLOBAL_PRNG 0 #endif // XGBOOST_CUSTOMIZE_GLOBAL_PRNG /*! @@ -86,34 +84,31 @@ #endif // !defined(XGBOOST_MM_PREFETCH_PRESENT) && !defined() -/*! \brief namespace of xgboost*/ namespace xgboost { - /*! \brief unsigned integer type used for feature index. */ -using bst_uint = uint32_t; // NOLINT +using bst_uint = std::uint32_t; // NOLINT /*! \brief unsigned long integers */ -using bst_ulong = uint64_t; // NOLINT +using bst_ulong = std::uint64_t; // NOLINT /*! \brief float type, used for storing statistics */ using bst_float = float; // NOLINT /*! \brief Categorical value type. */ -using bst_cat_t = int32_t; // NOLINT +using bst_cat_t = std::int32_t; // NOLINT /*! \brief Type for data column (feature) index. */ -using bst_feature_t = uint32_t; // NOLINT -/*! \brief Type for histogram bin index. */ -using bst_bin_t = int32_t; // NOLINT -/*! \brief Type for data row index. - * - * Be careful `std::size_t' is implementation-defined. Meaning that the binary - * representation of DMatrix might not be portable across platform. Booster model should - * be portable as parameters are floating points. +using bst_feature_t = std::uint32_t; // NOLINT +/** + * @brief Type for histogram bin index. We sometimes use -1 to indicate invalid bin. */ -using bst_row_t = std::size_t; // NOLINT +using bst_bin_t = std::int32_t; // NOLINT +/** + * @brief Type for data row index (sample). + */ +using bst_idx_t = std::uint64_t; // NOLINT /*! \brief Type for tree node index. */ using bst_node_t = std::int32_t; // NOLINT /*! \brief Type for ranking group index. */ using bst_group_t = std::uint32_t; // NOLINT /** - * \brief Type for indexing into output targets. + * @brief Type for indexing into output targets. */ using bst_target_t = std::uint32_t; // NOLINT /** @@ -306,8 +301,7 @@ class GradientPairInt64 { XGBOOST_DEVICE bool operator==(const GradientPairInt64 &rhs) const { return grad_ == rhs.grad_ && hess_ == rhs.hess_; } - friend std::ostream &operator<<(std::ostream &os, - const GradientPairInt64 &g) { + friend std::ostream &operator<<(std::ostream &os, const GradientPairInt64 &g) { os << g.GetQuantisedGrad() << "/" << g.GetQuantisedHess(); return os; } @@ -323,7 +317,7 @@ using omp_ulong = dmlc::omp_ulong; // NOLINT /*! \brief define unsigned int for openmp loop */ using bst_omp_uint = dmlc::omp_uint; // NOLINT /*! \brief Type used for representing version number in binary form.*/ -using XGBoostVersionT = int32_t; +using XGBoostVersionT = std::int32_t; } // namespace xgboost #endif // XGBOOST_BASE_H_ diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index 795c78946..19b93c644 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -1,5 +1,5 @@ /** - * Copyright 2015~2023 by XGBoost Contributors + * Copyright 2015-2024, XGBoost Contributors * \file c_api.h * \author Tianqi Chen * \brief C API of XGBoost, used for interfacing to other languages. 
@@ -639,21 +639,14 @@ XGB_DLL int XGDMatrixSetInfoFromInterface(DMatrixHandle handle, * \param len length of array * \return 0 when success, -1 when failure happens */ -XGB_DLL int XGDMatrixSetFloatInfo(DMatrixHandle handle, - const char *field, - const float *array, +XGB_DLL int XGDMatrixSetFloatInfo(DMatrixHandle handle, const char *field, const float *array, bst_ulong len); -/*! - * \brief set uint32 vector to a content in info - * \param handle a instance of data matrix - * \param field field name - * \param array pointer to unsigned int vector - * \param len length of array - * \return 0 when success, -1 when failure happens +/** + * @deprecated since 2.1.0 + * + * Use @ref XGDMatrixSetInfoFromInterface instead. */ -XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle, - const char *field, - const unsigned *array, +XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle, const char *field, const unsigned *array, bst_ulong len); /*! @@ -725,42 +718,13 @@ XGB_DLL int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle, const char *field, bst_ulong *size, const char ***out_features); -/*! - * \brief Set meta info from dense matrix. Valid field names are: +/** + * @deprecated since 2.1.0 * - * - label - * - weight - * - base_margin - * - group - * - label_lower_bound - * - label_upper_bound - * - feature_weights - * - * \param handle An instance of data matrix - * \param field Field name - * \param data Pointer to consecutive memory storing data. - * \param size Size of the data, this is relative to size of type. (Meaning NOT number - * of bytes.) - * \param type Indicator of data type. This is defined in xgboost::DataType enum class. - * - float = 1 - * - double = 2 - * - uint32_t = 3 - * - uint64_t = 4 - * \return 0 when success, -1 when failure happens + * Use @ref XGDMatrixSetInfoFromInterface instead. */ -XGB_DLL int XGDMatrixSetDenseInfo(DMatrixHandle handle, const char *field, - void const *data, bst_ulong size, int type); - -/*! - * \brief (deprecated) Use XGDMatrixSetUIntInfo instead. Set group of the training matrix - * \param handle a instance of data matrix - * \param group pointer to group size - * \param len length of array - * \return 0 when success, -1 when failure happens - */ -XGB_DLL int XGDMatrixSetGroup(DMatrixHandle handle, - const unsigned *group, - bst_ulong len); +XGB_DLL int XGDMatrixSetDenseInfo(DMatrixHandle handle, const char *field, void const *data, + bst_ulong size, int type); /*! * \brief get float info vector from matrix. @@ -1591,7 +1555,7 @@ XGB_DLL int XGTrackerCreate(char const *config, TrackerHandle *handle); /** * @brief Get the arguments needed for running workers. This should be called after - * XGTrackerRun() and XGTrackerWait() + * XGTrackerRun(). * * @param handle The handle to the tracker. * @param args The arguments returned as a JSON document. @@ -1601,16 +1565,19 @@ XGB_DLL int XGTrackerCreate(char const *config, TrackerHandle *handle); XGB_DLL int XGTrackerWorkerArgs(TrackerHandle handle, char const **args); /** - * @brief Run the tracker. + * @brief Start the tracker. The tracker runs in the background and this function returns + * once the tracker is started. * * @param handle The handle to the tracker. + * @param config Unused at the moment, preserved for the future. * * @return 0 for success, -1 for failure. */ -XGB_DLL int XGTrackerRun(TrackerHandle handle); +XGB_DLL int XGTrackerRun(TrackerHandle handle, char const *config); /** - * @brief Wait for the tracker to finish, should be called after XGTrackerRun(). 
+ * @brief Wait for the tracker to finish, should be called after XGTrackerRun(). This + * function will block until the tracker task is finished or timeout is reached. * * @param handle The handle to the tracker. * @param config JSON encoded configuration. No argument is required yet, preserved for @@ -1618,11 +1585,12 @@ XGB_DLL int XGTrackerRun(TrackerHandle handle); * * @return 0 for success, -1 for failure. */ -XGB_DLL int XGTrackerWait(TrackerHandle handle, char const *config); +XGB_DLL int XGTrackerWaitFor(TrackerHandle handle, char const *config); /** - * @brief Free a tracker instance. XGTrackerWait() is called internally. If the tracker - * cannot close properly, manual interruption is required. + * @brief Free a tracker instance. This should be called after XGTrackerWaitFor(). If the + * tracker is not properly waited, this function will shutdown all connections with + * the tracker, potentially leading to undefined behavior. * * @param handle The handle to the tracker. * diff --git a/include/xgboost/collective/result.h b/include/xgboost/collective/result.h index 507171dd4..23e70a8e6 100644 --- a/include/xgboost/collective/result.h +++ b/include/xgboost/collective/result.h @@ -1,13 +1,13 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #pragma once -#include // for unique_ptr -#include // for stringstream -#include // for stack -#include // for string -#include // for move +#include // for int32_t +#include // for unique_ptr +#include // for string +#include // for error_code +#include // for move namespace xgboost::collective { namespace detail { @@ -46,48 +46,19 @@ struct ResultImpl { return cur_eq; } - [[nodiscard]] std::string Report() { - std::stringstream ss; - ss << "\n- " << this->message; - if (this->errc != std::error_code{}) { - ss << " system error:" << this->errc.message(); - } + [[nodiscard]] std::string Report() const; + [[nodiscard]] std::error_code Code() const; - auto ptr = prev.get(); - while (ptr) { - ss << "\n- "; - ss << ptr->message; - - if (ptr->errc != std::error_code{}) { - ss << " " << ptr->errc.message(); - } - ptr = ptr->prev.get(); - } - - return ss.str(); - } - [[nodiscard]] auto Code() const { - // Find the root error. - std::stack stack; - auto ptr = this; - while (ptr) { - stack.push(ptr); - if (ptr->prev) { - ptr = ptr->prev.get(); - } else { - break; - } - } - while (!stack.empty()) { - auto frame = stack.top(); - stack.pop(); - if (frame->errc != std::error_code{}) { - return frame->errc; - } - } - return std::error_code{}; - } + void Concat(std::unique_ptr rhs); }; + +#if (!defined(__GNUC__) && !defined(__clang__)) || defined(__MINGW32__) +#define __builtin_FILE() nullptr +#define __builtin_LINE() (-1) +std::string MakeMsg(std::string&& msg, char const*, std::int32_t); +#else +std::string MakeMsg(std::string&& msg, char const* file, std::int32_t line); +#endif } // namespace detail /** @@ -129,8 +100,21 @@ struct Result { } return *impl_ == *that.impl_; } + + friend Result operator+(Result&& lhs, Result&& rhs); }; +[[nodiscard]] inline Result operator+(Result&& lhs, Result&& rhs) { + if (lhs.OK()) { + return std::forward(rhs); + } + if (rhs.OK()) { + return std::forward(lhs); + } + lhs.impl_->Concat(std::move(rhs.impl_)); + return std::forward(lhs); +} + /** * @brief Return success. */ @@ -138,32 +122,43 @@ struct Result { /** * @brief Return failure. 
*/ -[[nodiscard]] inline auto Fail(std::string msg) { return Result{std::move(msg)}; } +[[nodiscard]] inline auto Fail(std::string msg, char const* file = __builtin_FILE(), + std::int32_t line = __builtin_LINE()) { + return Result{detail::MakeMsg(std::move(msg), file, line)}; +} /** * @brief Return failure with `errno`. */ -[[nodiscard]] inline auto Fail(std::string msg, std::error_code errc) { - return Result{std::move(msg), std::move(errc)}; +[[nodiscard]] inline auto Fail(std::string msg, std::error_code errc, + char const* file = __builtin_FILE(), + std::int32_t line = __builtin_LINE()) { + return Result{detail::MakeMsg(std::move(msg), file, line), std::move(errc)}; } /** * @brief Return failure with a previous error. */ -[[nodiscard]] inline auto Fail(std::string msg, Result&& prev) { - return Result{std::move(msg), std::forward(prev)}; +[[nodiscard]] inline auto Fail(std::string msg, Result&& prev, char const* file = __builtin_FILE(), + std::int32_t line = __builtin_LINE()) { + return Result{detail::MakeMsg(std::move(msg), file, line), std::forward(prev)}; } /** * @brief Return failure with a previous error and a new `errno`. */ -[[nodiscard]] inline auto Fail(std::string msg, std::error_code errc, Result&& prev) { - return Result{std::move(msg), std::move(errc), std::forward(prev)}; +[[nodiscard]] inline auto Fail(std::string msg, std::error_code errc, Result&& prev, + char const* file = __builtin_FILE(), + std::int32_t line = __builtin_LINE()) { + return Result{detail::MakeMsg(std::move(msg), file, line), std::move(errc), + std::forward(prev)}; } // We don't have monad, a simple helper would do. template -Result operator<<(Result&& r, Fn&& fn) { +[[nodiscard]] std::enable_if_t, Result> operator<<(Result&& r, Fn&& fn) { if (!r.OK()) { return std::forward(r); } return fn(); } + +void SafeColl(Result const& rc); } // namespace xgboost::collective diff --git a/include/xgboost/collective/socket.h b/include/xgboost/collective/socket.h index 844534110..0e098052c 100644 --- a/include/xgboost/collective/socket.h +++ b/include/xgboost/collective/socket.h @@ -1,5 +1,5 @@ /** - * Copyright (c) 2022-2023, XGBoost Contributors + * Copyright (c) 2022-2024, XGBoost Contributors */ #pragma once @@ -12,7 +12,6 @@ #include // std::size_t #include // std::int32_t, std::uint16_t #include // memset -#include // std::numeric_limits #include // std::string #include // std::error_code, std::system_category #include // std::swap @@ -125,6 +124,21 @@ inline std::int32_t CloseSocket(SocketT fd) { #endif } +inline std::int32_t ShutdownSocket(SocketT fd) { +#if defined(_WIN32) + auto rc = shutdown(fd, SD_BOTH); + if (rc != 0 && LastError() == WSANOTINITIALISED) { + return 0; + } +#else + auto rc = shutdown(fd, SHUT_RDWR); + if (rc != 0 && LastError() == ENOTCONN) { + return 0; + } +#endif + return rc; +} + inline bool ErrorWouldBlock(std::int32_t errsv) noexcept(true) { #ifdef _WIN32 return errsv == WSAEWOULDBLOCK; @@ -436,41 +450,62 @@ class TCPSocket { * \brief Accept new connection, returns a new TCP socket for the new connection. 
*/ TCPSocket Accept() { - HandleT newfd = accept(Handle(), nullptr, nullptr); + SockAddress addr; + TCPSocket newsock; + auto rc = this->Accept(&newsock, &addr); + SafeColl(rc); + return newsock; + } + + [[nodiscard]] Result Accept(TCPSocket *out, SockAddress *addr) { #if defined(_WIN32) auto interrupt = WSAEINTR; #else auto interrupt = EINTR; #endif - if (newfd == InvalidSocket() && system::LastError() != interrupt) { - system::ThrowAtError("accept"); + if (this->Domain() == SockDomain::kV4) { + struct sockaddr_in caddr; + socklen_t caddr_len = sizeof(caddr); + HandleT newfd = accept(Handle(), reinterpret_cast(&caddr), &caddr_len); + if (newfd == InvalidSocket() && system::LastError() != interrupt) { + return system::FailWithCode("Failed to accept."); + } + *addr = SockAddress{SockAddrV4{caddr}}; + *out = TCPSocket{newfd}; + } else { + struct sockaddr_in6 caddr; + socklen_t caddr_len = sizeof(caddr); + HandleT newfd = accept(Handle(), reinterpret_cast(&caddr), &caddr_len); + if (newfd == InvalidSocket() && system::LastError() != interrupt) { + return system::FailWithCode("Failed to accept."); + } + *addr = SockAddress{SockAddrV6{caddr}}; + *out = TCPSocket{newfd}; } - TCPSocket newsock{newfd}; - return newsock; - } - - [[nodiscard]] Result Accept(TCPSocket *out, SockAddrV4 *addr) { - struct sockaddr_in caddr; - socklen_t caddr_len = sizeof(caddr); - HandleT newfd = accept(Handle(), reinterpret_cast(&caddr), &caddr_len); - if (newfd == InvalidSocket()) { - return system::FailWithCode("Failed to accept."); + // On MacOS, this is automatically set to async socket if the parent socket is async + // We make sure all socket are blocking by default. + // + // On Windows, a closed socket is returned during shutdown. We guard against it when + // setting non-blocking. + if (!out->IsClosed()) { + return out->NonBlocking(false); } - *addr = SockAddrV4{caddr}; - *out = TCPSocket{newfd}; return Success(); } ~TCPSocket() { if (!IsClosed()) { - Close(); + auto rc = this->Close(); + if (!rc.OK()) { + LOG(WARNING) << rc.Report(); + } } } TCPSocket(TCPSocket const &that) = delete; TCPSocket(TCPSocket &&that) noexcept(true) { std::swap(this->handle_, that.handle_); } TCPSocket &operator=(TCPSocket const &that) = delete; - TCPSocket &operator=(TCPSocket &&that) { + TCPSocket &operator=(TCPSocket &&that) noexcept(true) { std::swap(this->handle_, that.handle_); return *this; } @@ -479,36 +514,49 @@ class TCPSocket { */ [[nodiscard]] HandleT const &Handle() const { return handle_; } /** - * \brief Listen to incoming requests. Should be called after bind. + * @brief Listen to incoming requests. Should be called after bind. */ - void Listen(std::int32_t backlog = 16) { xgboost_CHECK_SYS_CALL(listen(handle_, backlog), 0); } + [[nodiscard]] Result Listen(std::int32_t backlog = 16) { + if (listen(handle_, backlog) != 0) { + return system::FailWithCode("Failed to listen."); + } + return Success(); + } /** - * \brief Bind socket to INADDR_ANY, return the port selected by the OS. + * @brief Bind socket to INADDR_ANY, return the port selected by the OS. */ - [[nodiscard]] in_port_t BindHost() { + [[nodiscard]] Result BindHost(std::int32_t* p_out) { + // Use int32 instead of in_port_t for consistency. We take port as parameter from + // users using other languages, the port is usually stored and passed around as int. 
if (Domain() == SockDomain::kV6) { auto addr = SockAddrV6::InaddrAny(); auto handle = reinterpret_cast(&addr.Handle()); - xgboost_CHECK_SYS_CALL( - bind(handle_, handle, sizeof(std::remove_reference_t)), 0); + if (bind(handle_, handle, sizeof(std::remove_reference_t)) != 0) { + return system::FailWithCode("bind failed."); + } sockaddr_in6 res_addr; socklen_t addrlen = sizeof(res_addr); - xgboost_CHECK_SYS_CALL( - getsockname(handle_, reinterpret_cast(&res_addr), &addrlen), 0); - return ntohs(res_addr.sin6_port); + if (getsockname(handle_, reinterpret_cast(&res_addr), &addrlen) != 0) { + return system::FailWithCode("getsockname failed."); + } + *p_out = ntohs(res_addr.sin6_port); } else { auto addr = SockAddrV4::InaddrAny(); auto handle = reinterpret_cast(&addr.Handle()); - xgboost_CHECK_SYS_CALL( - bind(handle_, handle, sizeof(std::remove_reference_t)), 0); + if (bind(handle_, handle, sizeof(std::remove_reference_t)) != 0) { + return system::FailWithCode("bind failed."); + } sockaddr_in res_addr; socklen_t addrlen = sizeof(res_addr); - xgboost_CHECK_SYS_CALL( - getsockname(handle_, reinterpret_cast(&res_addr), &addrlen), 0); - return ntohs(res_addr.sin_port); + if (getsockname(handle_, reinterpret_cast(&res_addr), &addrlen) != 0) { + return system::FailWithCode("getsockname failed."); + } + *p_out = ntohs(res_addr.sin_port); } + + return Success(); } [[nodiscard]] auto Port() const { @@ -621,26 +669,49 @@ class TCPSocket { */ std::size_t Send(StringView str); /** - * \brief Receive string, format is matched with the Python socket wrapper in RABIT. + * @brief Receive string, format is matched with the Python socket wrapper in RABIT. */ - std::size_t Recv(std::string *p_str); + [[nodiscard]] Result Recv(std::string *p_str); /** - * \brief Close the socket, called automatically in destructor if the socket is not closed. + * @brief Close the socket, called automatically in destructor if the socket is not closed. */ - void Close() { + [[nodiscard]] Result Close() { if (InvalidSocket() != handle_) { -#if defined(_WIN32) auto rc = system::CloseSocket(handle_); +#if defined(_WIN32) // it's possible that we close TCP sockets after finalizing WSA due to detached thread. if (rc != 0 && system::LastError() != WSANOTINITIALISED) { - system::ThrowAtError("close", rc); + return system::FailWithCode("Failed to close the socket."); } #else - xgboost_CHECK_SYS_CALL(system::CloseSocket(handle_), 0); + if (rc != 0) { + return system::FailWithCode("Failed to close the socket."); + } #endif handle_ = InvalidSocket(); } + return Success(); } + /** + * @brief Call shutdown on the socket. + */ + [[nodiscard]] Result Shutdown() { + if (this->IsClosed()) { + return Success(); + } + auto rc = system::ShutdownSocket(this->Handle()); +#if defined(_WIN32) + // Windows cannot shutdown a socket if it's not connected. + if (rc == -1 && system::LastError() == WSAENOTCONN) { + return Success(); + } +#endif + if (rc != 0) { + return system::FailWithCode("Failed to shutdown socket."); + } + return Success(); + } + /** * \brief Create a TCP socket on specified domain. */ diff --git a/include/xgboost/data.h b/include/xgboost/data.h index 08d3d119a..ec06a9c86 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -137,14 +136,6 @@ class MetaInfo { * \param fo The output stream. */ void SaveBinary(dmlc::Stream* fo) const; - /*! - * \brief Set information in the meta info. - * \param key The key of the information. 
- * \param dptr The data pointer of the source array. - * \param dtype The type of the source data. - * \param num Number of elements in the source array. - */ - void SetInfo(Context const& ctx, const char* key, const void* dptr, DataType dtype, size_t num); /*! * \brief Set information in the meta info with array interface. * \param key The key of the information. @@ -315,7 +306,7 @@ struct BatchParam { struct HostSparsePageView { using Inst = common::Span; - common::Span offset; + common::Span offset; common::Span data; Inst operator[](size_t i) const { @@ -333,7 +324,7 @@ struct HostSparsePageView { class SparsePage { public: // Offset for each row. - HostDeviceVector offset; + HostDeviceVector offset; /*! \brief the data of the segments */ HostDeviceVector data; @@ -517,10 +508,6 @@ class DMatrix { DMatrix() = default; /*! \brief meta information of the dataset */ virtual MetaInfo& Info() = 0; - virtual void SetInfo(const char* key, const void* dptr, DataType dtype, size_t num) { - auto const& ctx = *this->Ctx(); - this->Info().SetInfo(ctx, key, dptr, dtype, num); - } virtual void SetInfo(const char* key, std::string const& interface_str) { auto const& ctx = *this->Ctx(); this->Info().SetInfo(ctx, key, StringView{interface_str}); diff --git a/include/xgboost/json.h b/include/xgboost/json.h index a5872ec3a..1416b8899 100644 --- a/include/xgboost/json.h +++ b/include/xgboost/json.h @@ -1,5 +1,5 @@ /** - * Copyright 2019-2023 by XGBoost Contributors + * Copyright 2019-2024, XGBoost Contributors */ #ifndef XGBOOST_JSON_H_ #define XGBOOST_JSON_H_ @@ -42,7 +42,8 @@ class Value { kBoolean, kNull, // typed array for ubjson - kNumberArray, + kF32Array, + kF64Array, kU8Array, kI32Array, kI64Array @@ -59,9 +60,7 @@ class Value { virtual Json& operator[](int ind); virtual bool operator==(Value const& rhs) const = 0; -#if !defined(__APPLE__) virtual Value& operator=(Value const& rhs) = delete; -#endif // !defined(__APPLE__) std::string TypeStr() const; @@ -104,6 +103,7 @@ class JsonString : public Value { std::string& GetString() & { return str_; } bool operator==(Value const& rhs) const override; + Value& operator=(Value const& rhs) override = delete; static bool IsClassOf(Value const* value) { return value->Type() == ValueKind::kString; @@ -133,6 +133,7 @@ class JsonArray : public Value { std::vector& GetArray() & { return vec_; } bool operator==(Value const& rhs) const override; + Value& operator=(Value const& rhs) override = delete; static bool IsClassOf(Value const* value) { return value->Type() == ValueKind::kArray; @@ -157,6 +158,7 @@ class JsonTypedArray : public Value { JsonTypedArray(JsonTypedArray&& that) noexcept : Value{kind}, vec_{std::move(that.vec_)} {} bool operator==(Value const& rhs) const override; + Value& operator=(Value const& rhs) override = delete; void Set(size_t i, T v) { vec_[i] = v; } size_t Size() const { return vec_.size(); } @@ -173,7 +175,11 @@ class JsonTypedArray : public Value { /** * @brief Typed UBJSON array for 32-bit floating point. */ -using F32Array = JsonTypedArray; +using F32Array = JsonTypedArray; +/** + * @brief Typed UBJSON array for 64-bit floating point. + */ +using F64Array = JsonTypedArray; /** * @brief Typed UBJSON array for uint8_t. 
*/ @@ -211,6 +217,7 @@ class JsonObject : public Value { Map& GetObject() & { return object_; } bool operator==(Value const& rhs) const override; + Value& operator=(Value const& rhs) override = delete; static bool IsClassOf(Value const* value) { return value->Type() == ValueKind::kObject; } ~JsonObject() override = default; @@ -244,6 +251,7 @@ class JsonNumber : public Value { Float& GetNumber() & { return number_; } bool operator==(Value const& rhs) const override; + Value& operator=(Value const& rhs) override = delete; static bool IsClassOf(Value const* value) { return value->Type() == ValueKind::kNumber; @@ -282,6 +290,7 @@ class JsonInteger : public Value { : Value{ValueKind::kInteger}, integer_{that.integer_} {} bool operator==(Value const& rhs) const override; + Value& operator=(Value const& rhs) override = delete; Int const& GetInteger() && { return integer_; } Int const& GetInteger() const & { return integer_; } @@ -302,6 +311,7 @@ class JsonNull : public Value { void Save(JsonWriter* writer) const override; bool operator==(Value const& rhs) const override; + Value& operator=(Value const& rhs) override = delete; static bool IsClassOf(Value const* value) { return value->Type() == ValueKind::kNull; @@ -331,6 +341,7 @@ class JsonBoolean : public Value { bool& GetBoolean() & { return boolean_; } bool operator==(Value const& rhs) const override; + Value& operator=(Value const& rhs) override = delete; static bool IsClassOf(Value const* value) { return value->Type() == ValueKind::kBoolean; @@ -457,9 +468,9 @@ class Json { Json& operator[](int ind) const { return (*ptr_)[ind]; } /*! \brief Return the reference to stored Json value. */ - Value const& GetValue() const & { return *ptr_; } - Value const& GetValue() && { return *ptr_; } - Value& GetValue() & { return *ptr_; } + [[nodiscard]] Value const& GetValue() const& { return *ptr_; } + Value const& GetValue() && { return *ptr_; } + Value& GetValue() & { return *ptr_; } bool operator==(Json const& rhs) const { return *ptr_ == *(rhs.ptr_); @@ -472,7 +483,7 @@ class Json { return os; } - IntrusivePtr const& Ptr() const { return ptr_; } + [[nodiscard]] IntrusivePtr const& Ptr() const { return ptr_; } private: IntrusivePtr ptr_{new JsonNull}; diff --git a/include/xgboost/json_io.h b/include/xgboost/json_io.h index 3a73d170a..ce3d25c37 100644 --- a/include/xgboost/json_io.h +++ b/include/xgboost/json_io.h @@ -142,6 +142,7 @@ class JsonWriter { virtual void Visit(JsonArray const* arr); virtual void Visit(F32Array const* arr); + virtual void Visit(F64Array const*) { LOG(FATAL) << "Only UBJSON format can handle f64 array."; } virtual void Visit(U8Array const* arr); virtual void Visit(I32Array const* arr); virtual void Visit(I64Array const* arr); @@ -244,7 +245,8 @@ class UBJReader : public JsonReader { */ class UBJWriter : public JsonWriter { void Visit(JsonArray const* arr) override; - void Visit(F32Array const* arr) override; + void Visit(F32Array const* arr) override; + void Visit(F64Array const* arr) override; void Visit(U8Array const* arr) override; void Visit(I32Array const* arr) override; void Visit(I64Array const* arr) override; diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index 26a072e52..79810d4d0 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -190,13 +190,14 @@ constexpr auto ArrToTuple(T (&arr)[N]) { // uint division optimization inspired by the CIndexer in cupy. Division operation is // slow on both CPU and GPU, especially 64 bit integer. 
So here we first try to avoid 64 // bit when the index is smaller, then try to avoid division when it's exp of 2. -template +template LINALG_HD auto UnravelImpl(I idx, common::Span shape) { - size_t index[D]{0}; + std::size_t index[D]{0}; static_assert(std::is_signed::value, "Don't change the type without changing the for loop."); + auto const sptr = shape.data(); for (int32_t dim = D; --dim > 0;) { - auto s = static_cast>>(shape[dim]); + auto s = static_cast>>(sptr[dim]); if (s & (s - 1)) { auto t = idx / s; index[dim] = idx - t * s; @@ -295,6 +296,9 @@ class TensorView { using ShapeT = std::size_t[kDim]; using StrideT = ShapeT; + using element_type = T; // NOLINT + using value_type = std::remove_cv_t; // NOLINT + private: StrideT stride_{1}; ShapeT shape_{0}; @@ -314,7 +318,7 @@ class TensorView { } template - LINALG_HD size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], + LINALG_HD size_t MakeSliceDim(std::size_t new_shape[D], std::size_t new_stride[D], detail::RangeTag &&range) const { static_assert(new_dim < D); static_assert(old_dim < kDim); @@ -528,9 +532,10 @@ class TensorView { LINALG_HD auto Stride(size_t i) const { return stride_[i]; } /** - * \brief Number of items in the tensor. + * @brief Number of items in the tensor. */ [[nodiscard]] LINALG_HD std::size_t Size() const { return size_; } + [[nodiscard]] bool Empty() const { return Size() == 0; } /** * \brief Whether this is a contiguous array, both C and F contiguous returns true. */ @@ -741,6 +746,14 @@ auto ArrayInterfaceStr(TensorView const &t) { return str; } +template +auto Make1dInterface(T const *vec, std::size_t len) { + Context ctx; + auto t = linalg::MakeTensorView(&ctx, common::Span{vec, len}, len); + auto str = linalg::ArrayInterfaceStr(t); + return str; +} + /** * \brief A tensor storage. To use it for other functionality like slicing one needs to * obtain a view first. This way we can use it on both host and device. @@ -865,7 +878,9 @@ class Tensor { auto HostView() { return this->View(DeviceOrd::CPU()); } auto HostView() const { return this->View(DeviceOrd::CPU()); } - [[nodiscard]] size_t Size() const { return data_.Size(); } + [[nodiscard]] std::size_t Size() const { return data_.Size(); } + [[nodiscard]] bool Empty() const { return Size() == 0; } + auto Shape() const { return common::Span{shape_}; } auto Shape(size_t i) const { return shape_[i]; } diff --git a/include/xgboost/span.h b/include/xgboost/span.h index b0c1a5c1e..468f5ff50 100644 --- a/include/xgboost/span.h +++ b/include/xgboost/span.h @@ -30,9 +30,8 @@ #define XGBOOST_SPAN_H_ #include -#include -#include // size_t +#include // size_t #include #include #include // numeric_limits @@ -75,8 +74,7 @@ #endif // defined(_MSC_VER) && _MSC_VER < 1910 -namespace xgboost { -namespace common { +namespace xgboost::common { #if defined(__CUDA_ARCH__) // Usual logging facility is not available inside device code. @@ -738,14 +736,14 @@ class IterSpan { return {data() + _offset, _count == dynamic_extent ? 
size() - _offset : _count}; } [[nodiscard]] XGBOOST_DEVICE constexpr iterator begin() const noexcept { // NOLINT - return {this, 0}; + return it_; } [[nodiscard]] XGBOOST_DEVICE constexpr iterator end() const noexcept { // NOLINT - return {this, size()}; + return it_ + size(); } }; -} // namespace common -} // namespace xgboost +} // namespace xgboost::common + #if defined(_MSC_VER) &&_MSC_VER < 1910 #undef constexpr diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h index 4c475da2e..32b93c5ca 100644 --- a/include/xgboost/tree_model.h +++ b/include/xgboost/tree_model.h @@ -1,5 +1,5 @@ /** - * Copyright 2014-2023 by Contributors + * Copyright 2014-2024, XGBoost Contributors * \file tree_model.h * \brief model structure for tree * \author Tianqi Chen @@ -688,6 +688,9 @@ class RegTree : public Model { } return (*this)[nidx].DefaultLeft(); } + [[nodiscard]] bst_node_t DefaultChild(bst_node_t nidx) const { + return this->DefaultLeft(nidx) ? this->LeftChild(nidx) : this->RightChild(nidx); + } [[nodiscard]] bool IsRoot(bst_node_t nidx) const { if (IsMultiTarget()) { return nidx == kRoot; diff --git a/jvm-packages/create_jni.py b/jvm-packages/create_jni.py index 395bc79b0..ff7bba693 100755 --- a/jvm-packages/create_jni.py +++ b/jvm-packages/create_jni.py @@ -83,44 +83,59 @@ def native_build(args): with cd(".."): build_dir = 'build-gpu' if cli_args.use_cuda == 'ON' or cli_args.use_hip == 'ON' else 'build' maybe_makedirs(build_dir) + + if sys.platform == "linux": + maybe_parallel_build = " -- -j $(nproc)" + else: + maybe_parallel_build = "" + + if cli_args.log_capi_invocation == "ON": + CONFIG["LOG_CAPI_INVOCATION"] = "ON" + + if cli_args.use_cuda == "ON": + CONFIG["USE_CUDA"] = "ON" + CONFIG["USE_NCCL"] = "ON" + CONFIG["USE_DLOPEN_NCCL"] = "OFF" + elif cli_args.use_hip== 'ON': + CONFIG['USE_HIP'] = 'ON' + CONFIG['USE_RCCL'] = 'ON' + CONFIG["USE_DLOPEN_RCCL"] = "OFF" + + args = ["-D{0}:BOOL={1}".format(k, v) for k, v in CONFIG.items()] + + # if enviorment set rabit_mock + if os.getenv("RABIT_MOCK", None) is not None: + args.append("-DRABIT_MOCK:BOOL=ON") + + # if enviorment set GPU_ARCH_FLAG + gpu_arch_flag = os.getenv("GPU_ARCH_FLAG", None) + if gpu_arch_flag is not None: + args.append("%s" % gpu_arch_flag) + with cd(build_dir): - if sys.platform == "win32": - # Force x64 build on Windows. - maybe_generator = " -A x64" - else: - maybe_generator = "" - if sys.platform == "linux": - maybe_parallel_build = " -- -j $(nproc)" - else: - maybe_parallel_build = "" - - if cli_args.log_capi_invocation == "ON": - CONFIG["LOG_CAPI_INVOCATION"] = "ON" - - if cli_args.use_cuda == "ON": - CONFIG["USE_CUDA"] = "ON" - CONFIG["USE_NCCL"] = "ON" - CONFIG["USE_DLOPEN_NCCL"] = "OFF" - elif cli_args.use_hip== 'ON': - CONFIG['USE_HIP'] = 'ON' - CONFIG['USE_RCCL'] = 'ON' - CONFIG["USE_DLOPEN_RCCL"] = "OFF" - - args = ["-D{0}:BOOL={1}".format(k, v) for k, v in CONFIG.items()] - - # if enviorment set rabit_mock - if os.getenv("RABIT_MOCK", None) is not None: - args.append("-DRABIT_MOCK:BOOL=ON") - - # if enviorment set GPU_ARCH_FLAG - gpu_arch_flag = os.getenv("GPU_ARCH_FLAG", None) - if gpu_arch_flag is not None: - args.append("%s" % gpu_arch_flag) - lib_dir = os.path.join(os.pardir, "lib") if os.path.exists(lib_dir): shutil.rmtree(lib_dir) - run("cmake .. " + " ".join(args) + maybe_generator) + + # Same trick as Python build, just test all possible generators. 
+ if sys.platform == "win32": + supported_generators = ( + "", # empty, decided by cmake + '-G"Visual Studio 17 2022" -A x64', + '-G"Visual Studio 16 2019" -A x64', + '-G"Visual Studio 15 2017" -A x64', + ) + for generator in supported_generators: + try: + run("cmake .. " + " ".join(args + [generator])) + break + except subprocess.CalledProcessError as e: + print(f"Failed to build with generator: {generator}", e) + with cd(os.path.pardir): + shutil.rmtree(build_dir) + maybe_makedirs(build_dir) + else: + run("cmake .. " + " ".join(args)) run("cmake --build . --config Release" + maybe_parallel_build) with cd("demo/CLI/regression"): diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index 5b6f82b6a..70e054d3a 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -33,7 +33,7 @@ UTF-8 1.8 1.8 - 1.18.0 + 1.19.0 4.13.2 3.4.1 3.4.1 @@ -46,9 +46,9 @@ 23.12.1 23.12.1 cuda12 + 3.2.18 + 2.12.0 OFF - 3.2.17 - 2.11.0 @@ -124,7 +124,7 @@ org.apache.maven.plugins maven-jar-plugin - 3.3.0 + 3.4.0 empty-javadoc-jar @@ -153,7 +153,7 @@ org.apache.maven.plugins maven-gpg-plugin - 3.1.0 + 3.2.3 sign-artifacts @@ -167,7 +167,7 @@ org.apache.maven.plugins maven-source-plugin - 3.3.0 + 3.3.1 attach-sources @@ -205,7 +205,7 @@ org.apache.maven.plugins maven-assembly-plugin - 3.6.0 + 3.7.1 jar-with-dependencies @@ -446,7 +446,7 @@ org.apache.maven.plugins maven-surefire-plugin - 3.2.2 + 3.2.5 false false @@ -488,12 +488,12 @@ com.esotericsoftware kryo - 5.5.0 + 5.6.0 commons-logging commons-logging - 1.3.0 + 1.3.1 org.scalatest diff --git a/jvm-packages/xgboost4j-gpu/pom.xml b/jvm-packages/xgboost4j-gpu/pom.xml index 2dc36d52d..26ad9cafd 100644 --- a/jvm-packages/xgboost4j-gpu/pom.xml +++ b/jvm-packages/xgboost4j-gpu/pom.xml @@ -72,7 +72,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.6.2 + 3.6.3 protected true @@ -88,7 +88,7 @@ exec-maven-plugin org.codehaus.mojo - 3.1.0 + 3.2.0 native @@ -115,7 +115,7 @@ org.apache.maven.plugins maven-jar-plugin - 3.3.0 + 3.4.0 diff --git a/jvm-packages/xgboost4j-tester/generate_pom.py b/jvm-packages/xgboost4j-tester/generate_pom.py index b9c274c28..eb7cf94b3 100644 --- a/jvm-packages/xgboost4j-tester/generate_pom.py +++ b/jvm-packages/xgboost4j-tester/generate_pom.py @@ -22,7 +22,7 @@ pom_template = """ {scala_version} 3.2.15 {scala_binary_version} - 5.5.0 + 5.6.0 diff --git a/jvm-packages/xgboost4j/pom.xml b/jvm-packages/xgboost4j/pom.xml index 7eb186919..5012eaf14 100644 --- a/jvm-packages/xgboost4j/pom.xml +++ b/jvm-packages/xgboost4j/pom.xml @@ -60,7 +60,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.6.2 + 3.6.3 protected true @@ -76,7 +76,7 @@ exec-maven-plugin org.codehaus.mojo - 3.1.0 + 3.2.0 native @@ -99,7 +99,7 @@ org.apache.maven.plugins maven-jar-plugin - 3.3.0 + 3.4.0 diff --git a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp index 332b1a127..9ba944d5a 100644 --- a/jvm-packages/xgboost4j/src/native/xgboost4j.cpp +++ b/jvm-packages/xgboost4j/src/native/xgboost4j.cpp @@ -408,7 +408,8 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixSetFloatI jfloat* array = jenv->GetFloatArrayElements(jarray, NULL); bst_ulong len = (bst_ulong)jenv->GetArrayLength(jarray); - int ret = XGDMatrixSetFloatInfo(handle, field, (float const *)array, len); + auto str = xgboost::linalg::Make1dInterface(array, len); + int ret = XGDMatrixSetInfoFromInterface(handle, field, str.c_str()); JVM_CHECK_CALL(ret); //release if (field) jenv->ReleaseStringUTFChars(jfield, field); @@ -427,7 +428,8 @@ 
JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGDMatrixSetUIntIn const char* field = jenv->GetStringUTFChars(jfield, 0); jint* array = jenv->GetIntArrayElements(jarray, NULL); bst_ulong len = (bst_ulong)jenv->GetArrayLength(jarray); - int ret = XGDMatrixSetUIntInfo(handle, (char const *)field, (unsigned int const *)array, len); + auto str = xgboost::linalg::Make1dInterface(array, len); + int ret = XGDMatrixSetInfoFromInterface(handle, field, str.c_str()); JVM_CHECK_CALL(ret); //release if (field) jenv->ReleaseStringUTFChars(jfield, (const char *)field); @@ -730,8 +732,8 @@ JNIEXPORT jint JNICALL Java_ml_dmlc_xgboost4j_java_XGBoostJNI_XGBoosterPredictFr if (jmargin) { margin = jenv->GetFloatArrayElements(jmargin, nullptr); JVM_CHECK_CALL(XGProxyDMatrixCreate(&proxy)); - JVM_CHECK_CALL( - XGDMatrixSetFloatInfo(proxy, "base_margin", margin, jenv->GetArrayLength(jmargin))); + auto str = xgboost::linalg::Make1dInterface(margin, jenv->GetArrayLength(jmargin)); + JVM_CHECK_CALL(XGDMatrixSetInfoFromInterface(proxy, "base_margin", str.c_str())); } bst_ulong const *out_shape; diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt index e575f1a41..5d20e120e 100644 --- a/plugin/CMakeLists.txt +++ b/plugin/CMakeLists.txt @@ -1,10 +1,7 @@ if(PLUGIN_SYCL) set(CMAKE_CXX_COMPILER "icpx") - add_library(plugin_sycl OBJECT - ${xgboost_SOURCE_DIR}/plugin/sycl/objective/regression_obj.cc - ${xgboost_SOURCE_DIR}/plugin/sycl/objective/multiclass_obj.cc - ${xgboost_SOURCE_DIR}/plugin/sycl/device_manager.cc - ${xgboost_SOURCE_DIR}/plugin/sycl/predictor/predictor.cc) + file(GLOB_RECURSE SYCL_SOURCES "sycl/*.cc") + add_library(plugin_sycl OBJECT ${SYCL_SOURCES}) target_include_directories(plugin_sycl PRIVATE ${xgboost_SOURCE_DIR}/include diff --git a/plugin/federated/federated_coll.cc b/plugin/federated/federated_coll.cc index b3dc23dba..34670715a 100644 --- a/plugin/federated/federated_coll.cc +++ b/plugin/federated/federated_coll.cc @@ -89,19 +89,15 @@ Coll *FederatedColl::MakeCUDAVar() { [[nodiscard]] Result FederatedColl::Broadcast(Comm const &comm, common::Span data, std::int32_t root) { - if (comm.Rank() == root) { - return BroadcastImpl(comm, &this->sequence_number_, data, root); - } else { - return BroadcastImpl(comm, &this->sequence_number_, data, root); - } + return BroadcastImpl(comm, &this->sequence_number_, data, root); } -[[nodiscard]] Result FederatedColl::Allgather(Comm const &comm, common::Span data, - std::int64_t size) { +[[nodiscard]] Result FederatedColl::Allgather(Comm const &comm, common::Span data) { using namespace federated; // NOLINT auto fed = dynamic_cast(&comm); CHECK(fed); auto stub = fed->Handle(); + auto size = data.size_bytes() / comm.World(); auto offset = comm.Rank() * size; auto segment = data.subspan(offset, size); diff --git a/plugin/federated/federated_coll.cu b/plugin/federated/federated_coll.cu index a922e1c11..3f604c50d 100644 --- a/plugin/federated/federated_coll.cu +++ b/plugin/federated/federated_coll.cu @@ -53,8 +53,7 @@ Coll *FederatedColl::MakeCUDAVar() { }; } -[[nodiscard]] Result CUDAFederatedColl::Allgather(Comm const &comm, common::Span data, - std::int64_t size) { +[[nodiscard]] Result CUDAFederatedColl::Allgather(Comm const &comm, common::Span data) { auto cufed = dynamic_cast(&comm); CHECK(cufed); std::vector h_data(data.size()); @@ -63,7 +62,7 @@ Coll *FederatedColl::MakeCUDAVar() { return GetCUDAResult( cudaMemcpy(h_data.data(), data.data(), data.size(), cudaMemcpyDeviceToHost)); } << [&] { - return p_impl_->Allgather(comm, 
common::Span{h_data.data(), h_data.size()}, size); + return p_impl_->Allgather(comm, common::Span{h_data.data(), h_data.size()}); } << [&] { return GetCUDAResult(cudaMemcpyAsync(data.data(), h_data.data(), data.size(), cudaMemcpyHostToDevice, cufed->Stream())); diff --git a/plugin/federated/federated_coll.cuh b/plugin/federated/federated_coll.cuh index a1121d88f..6a690a33d 100644 --- a/plugin/federated/federated_coll.cuh +++ b/plugin/federated/federated_coll.cuh @@ -1,5 +1,5 @@ /** - * Copyright 2023, XGBoost contributors + * Copyright 2023-2024, XGBoost contributors */ #include "../../src/collective/comm.h" // for Comm, Coll #include "federated_coll.h" // for FederatedColl @@ -16,8 +16,7 @@ class CUDAFederatedColl : public Coll { ArrayInterfaceHandler::Type type, Op op) override; [[nodiscard]] Result Broadcast(Comm const &comm, common::Span data, std::int32_t root) override; - [[nodiscard]] Result Allgather(Comm const &, common::Span data, - std::int64_t size) override; + [[nodiscard]] Result Allgather(Comm const &, common::Span data) override; [[nodiscard]] Result AllgatherV(Comm const &comm, common::Span data, common::Span sizes, common::Span recv_segments, diff --git a/plugin/federated/federated_coll.h b/plugin/federated/federated_coll.h index c261b01e1..12443a3e1 100644 --- a/plugin/federated/federated_coll.h +++ b/plugin/federated/federated_coll.h @@ -1,12 +1,9 @@ /** - * Copyright 2023, XGBoost contributors + * Copyright 2023-2024, XGBoost contributors */ #pragma once #include "../../src/collective/coll.h" // for Coll #include "../../src/collective/comm.h" // for Comm -#include "../../src/common/io.h" // for ReadAll -#include "../../src/common/json_utils.h" // for OptionalArg -#include "xgboost/json.h" // for Json namespace xgboost::collective { class FederatedColl : public Coll { @@ -20,8 +17,7 @@ class FederatedColl : public Coll { ArrayInterfaceHandler::Type type, Op op) override; [[nodiscard]] Result Broadcast(Comm const &comm, common::Span data, std::int32_t root) override; - [[nodiscard]] Result Allgather(Comm const &, common::Span data, - std::int64_t) override; + [[nodiscard]] Result Allgather(Comm const &, common::Span data) override; [[nodiscard]] Result AllgatherV(Comm const &comm, common::Span data, common::Span sizes, common::Span recv_segments, diff --git a/plugin/federated/federated_comm.cuh b/plugin/federated/federated_comm.cuh index 58c52f67e..85cecb3eb 100644 --- a/plugin/federated/federated_comm.cuh +++ b/plugin/federated/federated_comm.cuh @@ -1,5 +1,5 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #pragma once @@ -9,7 +9,6 @@ #include "../../src/common/device_helpers.cuh" // for CUDAStreamView #include "federated_comm.h" // for FederatedComm #include "xgboost/context.h" // for Context -#include "xgboost/logging.h" namespace xgboost::collective { class CUDAFederatedComm : public FederatedComm { diff --git a/plugin/federated/federated_comm.h b/plugin/federated/federated_comm.h index 750d94abd..b39e1878a 100644 --- a/plugin/federated/federated_comm.h +++ b/plugin/federated/federated_comm.h @@ -1,5 +1,5 @@ /** - * Copyright 2023, XGBoost contributors + * Copyright 2023-2024, XGBoost contributors */ #pragma once @@ -11,7 +11,6 @@ #include // for string #include "../../src/collective/comm.h" // for HostComm -#include "../../src/common/json_utils.h" // for OptionalArg #include "xgboost/json.h" namespace xgboost::collective { @@ -51,6 +50,10 @@ class FederatedComm : public HostComm { std::int32_t rank) { this->Init(host, 
port, world, rank, {}, {}, {}); } + [[nodiscard]] Result Shutdown() final { + this->ResetState(); + return Success(); + } ~FederatedComm() override { stub_.reset(); } [[nodiscard]] std::shared_ptr Chan(std::int32_t) const override { @@ -65,5 +68,13 @@ class FederatedComm : public HostComm { [[nodiscard]] federated::Federated::Stub* Handle() const { return stub_.get(); } [[nodiscard]] Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr pimpl) const override; + /** + * @brief Get a string ID for the current process. + */ + [[nodiscard]] Result ProcessorName(std::string* out) const final { + auto rank = this->Rank(); + *out = "rank:" + std::to_string(rank); + return Success(); + }; }; } // namespace xgboost::collective diff --git a/plugin/federated/federated_server.h b/plugin/federated/federated_server.h index de760d9d8..4692ad6c2 100644 --- a/plugin/federated/federated_server.h +++ b/plugin/federated/federated_server.h @@ -1,22 +1,18 @@ /** - * Copyright 2022-2023, XGBoost contributors + * Copyright 2022-2024, XGBoost contributors */ #pragma once #include #include // for int32_t -#include // for future #include "../../src/collective/in_memory_handler.h" -#include "../../src/collective/tracker.h" // for Tracker -#include "xgboost/collective/result.h" // for Result namespace xgboost::federated { class FederatedService final : public Federated::Service { public: - explicit FederatedService(std::int32_t world_size) - : handler_{static_cast(world_size)} {} + explicit FederatedService(std::int32_t world_size) : handler_{world_size} {} grpc::Status Allgather(grpc::ServerContext* context, AllgatherRequest const* request, AllgatherReply* reply) override; diff --git a/plugin/federated/federated_tracker.cc b/plugin/federated/federated_tracker.cc index 37b6c3639..5051d43cb 100644 --- a/plugin/federated/federated_tracker.cc +++ b/plugin/federated/federated_tracker.cc @@ -125,14 +125,14 @@ Result FederatedTracker::Shutdown() { [[nodiscard]] Json FederatedTracker::WorkerArgs() const { auto rc = this->WaitUntilReady(); - CHECK(rc.OK()) << rc.Report(); + SafeColl(rc); std::string host; rc = GetHostAddress(&host); CHECK(rc.OK()); Json args{Object{}}; - args["DMLC_TRACKER_URI"] = String{host}; - args["DMLC_TRACKER_PORT"] = this->Port(); + args["dmlc_tracker_uri"] = String{host}; + args["dmlc_tracker_port"] = this->Port(); return args; } } // namespace xgboost::collective diff --git a/plugin/federated/federated_tracker.h b/plugin/federated/federated_tracker.h index 33592fefe..ac46b6eaa 100644 --- a/plugin/federated/federated_tracker.h +++ b/plugin/federated/federated_tracker.h @@ -17,8 +17,7 @@ namespace xgboost::collective { namespace federated { class FederatedService final : public Federated::Service { public: - explicit FederatedService(std::int32_t world_size) - : handler_{static_cast(world_size)} {} + explicit FederatedService(std::int32_t world_size) : handler_{world_size} {} grpc::Status Allgather(grpc::ServerContext* context, AllgatherRequest const* request, AllgatherReply* reply) override; diff --git a/plugin/sycl/common/hist_util.cc b/plugin/sycl/common/hist_util.cc new file mode 100644 index 000000000..fd813a92c --- /dev/null +++ b/plugin/sycl/common/hist_util.cc @@ -0,0 +1,334 @@ +/*! + * Copyright 2017-2023 by Contributors + * \file hist_util.cc + */ +#include +#include +#include + +#include "../data/gradient_index.h" +#include "hist_util.h" + +#include + +namespace xgboost { +namespace sycl { +namespace common { + +/*! 
+ * \brief Fill histogram with zeroes + */ +template +void InitHist(::sycl::queue qu, GHistRow* hist, + size_t size, ::sycl::event* event) { + *event = qu.fill(hist->Begin(), + xgboost::detail::GradientPairInternal(), size, *event); +} +template void InitHist(::sycl::queue qu, + GHistRow* hist, + size_t size, ::sycl::event* event); +template void InitHist(::sycl::queue qu, + GHistRow* hist, + size_t size, ::sycl::event* event); + +/*! + * \brief Compute Subtraction: dst = src1 - src2 + */ +template +::sycl::event SubtractionHist(::sycl::queue qu, + GHistRow* dst, + const GHistRow& src1, + const GHistRow& src2, + size_t size, ::sycl::event event_priv) { + GradientSumT* pdst = reinterpret_cast(dst->Data()); + const GradientSumT* psrc1 = reinterpret_cast(src1.DataConst()); + const GradientSumT* psrc2 = reinterpret_cast(src2.DataConst()); + + auto event_final = qu.submit([&](::sycl::handler& cgh) { + cgh.depends_on(event_priv); + cgh.parallel_for<>(::sycl::range<1>(2 * size), [pdst, psrc1, psrc2](::sycl::item<1> pid) { + const size_t i = pid.get_id(0); + pdst[i] = psrc1[i] - psrc2[i]; + }); + }); + return event_final; +} +template ::sycl::event SubtractionHist(::sycl::queue qu, + GHistRow* dst, + const GHistRow& src1, + const GHistRow& src2, + size_t size, ::sycl::event event_priv); +template ::sycl::event SubtractionHist(::sycl::queue qu, + GHistRow* dst, + const GHistRow& src1, + const GHistRow& src2, + size_t size, ::sycl::event event_priv); + +// Kernel with buffer using +template +::sycl::event BuildHistKernel(::sycl::queue qu, + const USMVector& gpair_device, + const RowSetCollection::Elem& row_indices, + const GHistIndexMatrix& gmat, + GHistRow* hist, + GHistRow* hist_buffer, + ::sycl::event event_priv) { + const size_t size = row_indices.Size(); + const size_t* rid = row_indices.begin; + const size_t n_columns = isDense ? gmat.nfeatures : gmat.row_stride; + const GradientPair::ValueT* pgh = + reinterpret_cast(gpair_device.DataConst()); + const BinIdxType* gradient_index = gmat.index.data(); + const uint32_t* offsets = gmat.index.Offset(); + FPType* hist_data = reinterpret_cast(hist->Data()); + const size_t nbins = gmat.nbins; + + const size_t max_work_group_size = + qu.get_device().get_info<::sycl::info::device::max_work_group_size>(); + const size_t work_group_size = n_columns < max_work_group_size ? 
n_columns : max_work_group_size; + + const size_t max_nblocks = hist_buffer->Size() / (nbins * 2); + const size_t min_block_size = 128; + size_t nblocks = std::min(max_nblocks, size / min_block_size + !!(size % min_block_size)); + const size_t block_size = size / nblocks + !!(size % nblocks); + FPType* hist_buffer_data = reinterpret_cast(hist_buffer->Data()); + + auto event_fill = qu.fill(hist_buffer_data, FPType(0), nblocks * nbins * 2, event_priv); + auto event_main = qu.submit([&](::sycl::handler& cgh) { + cgh.depends_on(event_fill); + cgh.parallel_for<>(::sycl::nd_range<2>(::sycl::range<2>(nblocks, work_group_size), + ::sycl::range<2>(1, work_group_size)), + [=](::sycl::nd_item<2> pid) { + size_t block = pid.get_global_id(0); + size_t feat = pid.get_global_id(1); + + FPType* hist_local = hist_buffer_data + block * nbins * 2; + for (size_t idx = 0; idx < block_size; ++idx) { + size_t i = block * block_size + idx; + if (i < size) { + const size_t icol_start = n_columns * rid[i]; + const size_t idx_gh = rid[i]; + + pid.barrier(::sycl::access::fence_space::local_space); + const BinIdxType* gr_index_local = gradient_index + icol_start; + + for (size_t j = feat; j < n_columns; j += work_group_size) { + uint32_t idx_bin = static_cast(gr_index_local[j]); + if constexpr (isDense) { + idx_bin += offsets[j]; + } + if (idx_bin < nbins) { + hist_local[2 * idx_bin] += pgh[2 * idx_gh]; + hist_local[2 * idx_bin+1] += pgh[2 * idx_gh+1]; + } + } + } + } + }); + }); + + auto event_save = qu.submit([&](::sycl::handler& cgh) { + cgh.depends_on(event_main); + cgh.parallel_for<>(::sycl::range<1>(nbins), [=](::sycl::item<1> pid) { + size_t idx_bin = pid.get_id(0); + + FPType gsum = 0.0f; + FPType hsum = 0.0f; + + for (size_t j = 0; j < nblocks; ++j) { + gsum += hist_buffer_data[j * nbins * 2 + 2 * idx_bin]; + hsum += hist_buffer_data[j * nbins * 2 + 2 * idx_bin + 1]; + } + + hist_data[2 * idx_bin] = gsum; + hist_data[2 * idx_bin + 1] = hsum; + }); + }); + return event_save; +} + +// Kernel with atomic using +template +::sycl::event BuildHistKernel(::sycl::queue qu, + const USMVector& gpair_device, + const RowSetCollection::Elem& row_indices, + const GHistIndexMatrix& gmat, + GHistRow* hist, + ::sycl::event event_priv) { + const size_t size = row_indices.Size(); + const size_t* rid = row_indices.begin; + const size_t n_columns = isDense ? gmat.nfeatures : gmat.row_stride; + const GradientPair::ValueT* pgh = + reinterpret_cast(gpair_device.DataConst()); + const BinIdxType* gradient_index = gmat.index.data(); + const uint32_t* offsets = gmat.index.Offset(); + FPType* hist_data = reinterpret_cast(hist->Data()); + const size_t nbins = gmat.nbins; + + const size_t max_work_group_size = + qu.get_device().get_info<::sycl::info::device::max_work_group_size>(); + const size_t feat_local = n_columns < max_work_group_size ? 
n_columns : max_work_group_size; + + auto event_fill = qu.fill(hist_data, FPType(0), nbins * 2, event_priv); + auto event_main = qu.submit([&](::sycl::handler& cgh) { + cgh.depends_on(event_fill); + cgh.parallel_for<>(::sycl::range<2>(size, feat_local), + [=](::sycl::item<2> pid) { + size_t i = pid.get_id(0); + size_t feat = pid.get_id(1); + + const size_t icol_start = n_columns * rid[i]; + const size_t idx_gh = rid[i]; + + const BinIdxType* gr_index_local = gradient_index + icol_start; + + for (size_t j = feat; j < n_columns; j += feat_local) { + uint32_t idx_bin = static_cast(gr_index_local[j]); + if constexpr (isDense) { + idx_bin += offsets[j]; + } + if (idx_bin < nbins) { + AtomicRef gsum(hist_data[2 * idx_bin]); + AtomicRef hsum(hist_data[2 * idx_bin + 1]); + gsum.fetch_add(pgh[2 * idx_gh]); + hsum.fetch_add(pgh[2 * idx_gh + 1]); + } + } + }); + }); + return event_main; +} + +template +::sycl::event BuildHistDispatchKernel( + ::sycl::queue qu, + const USMVector& gpair_device, + const RowSetCollection::Elem& row_indices, + const GHistIndexMatrix& gmat, + GHistRow* hist, + bool isDense, + GHistRow* hist_buffer, + ::sycl::event events_priv, + bool force_atomic_use) { + const size_t size = row_indices.Size(); + const size_t n_columns = isDense ? gmat.nfeatures : gmat.row_stride; + const size_t nbins = gmat.nbins; + + // max cycle size, while atomics are still effective + const size_t max_cycle_size_atomics = nbins; + const size_t cycle_size = size; + + // TODO(razdoburdin): replace the add-hock dispatching criteria by more sutable one + bool use_atomic = (size < nbins) || (gmat.max_num_bins == gmat.nbins / n_columns); + + // force_atomic_use flag is used only for testing + use_atomic = use_atomic || force_atomic_use; + if (!use_atomic) { + if (isDense) { + return BuildHistKernel(qu, gpair_device, row_indices, + gmat, hist, hist_buffer, + events_priv); + } else { + return BuildHistKernel(qu, gpair_device, row_indices, + gmat, hist, hist_buffer, + events_priv); + } + } else { + if (isDense) { + return BuildHistKernel(qu, gpair_device, row_indices, + gmat, hist, events_priv); + } else { + return BuildHistKernel(qu, gpair_device, row_indices, + gmat, hist, events_priv); + } + } +} + +template +::sycl::event BuildHistKernel(::sycl::queue qu, + const USMVector& gpair_device, + const RowSetCollection::Elem& row_indices, + const GHistIndexMatrix& gmat, const bool isDense, + GHistRow* hist, + GHistRow* hist_buffer, + ::sycl::event event_priv, + bool force_atomic_use) { + const bool is_dense = isDense; + switch (gmat.index.GetBinTypeSize()) { + case BinTypeSize::kUint8BinsTypeSize: + return BuildHistDispatchKernel(qu, gpair_device, row_indices, + gmat, hist, is_dense, hist_buffer, + event_priv, force_atomic_use); + break; + case BinTypeSize::kUint16BinsTypeSize: + return BuildHistDispatchKernel(qu, gpair_device, row_indices, + gmat, hist, is_dense, hist_buffer, + event_priv, force_atomic_use); + break; + case BinTypeSize::kUint32BinsTypeSize: + return BuildHistDispatchKernel(qu, gpair_device, row_indices, + gmat, hist, is_dense, hist_buffer, + event_priv, force_atomic_use); + break; + default: + CHECK(false); // no default behavior + } +} + +template +::sycl::event GHistBuilder::BuildHist( + const USMVector& gpair_device, + const RowSetCollection::Elem& row_indices, + const GHistIndexMatrix &gmat, + GHistRowT* hist, + bool isDense, + GHistRowT* hist_buffer, + ::sycl::event event_priv, + bool force_atomic_use) { + return BuildHistKernel(qu_, gpair_device, row_indices, gmat, + isDense, hist, 
hist_buffer, event_priv, + force_atomic_use); +} + +template +::sycl::event GHistBuilder::BuildHist( + const USMVector& gpair_device, + const RowSetCollection::Elem& row_indices, + const GHistIndexMatrix& gmat, + GHistRow* hist, + bool isDense, + GHistRow* hist_buffer, + ::sycl::event event_priv, + bool force_atomic_use); +template +::sycl::event GHistBuilder::BuildHist( + const USMVector& gpair_device, + const RowSetCollection::Elem& row_indices, + const GHistIndexMatrix& gmat, + GHistRow* hist, + bool isDense, + GHistRow* hist_buffer, + ::sycl::event event_priv, + bool force_atomic_use); + +template +void GHistBuilder::SubtractionTrick(GHistRowT* self, + const GHistRowT& sibling, + const GHistRowT& parent) { + const size_t size = self->Size(); + CHECK_EQ(sibling.Size(), size); + CHECK_EQ(parent.Size(), size); + + SubtractionHist(qu_, self, parent, sibling, size, ::sycl::event()); +} +template +void GHistBuilder::SubtractionTrick(GHistRow* self, + const GHistRow& sibling, + const GHistRow& parent); +template +void GHistBuilder::SubtractionTrick(GHistRow* self, + const GHistRow& sibling, + const GHistRow& parent); +} // namespace common +} // namespace sycl +} // namespace xgboost diff --git a/plugin/sycl/common/hist_util.h b/plugin/sycl/common/hist_util.h new file mode 100644 index 000000000..7c7af71ae --- /dev/null +++ b/plugin/sycl/common/hist_util.h @@ -0,0 +1,89 @@ +/*! + * Copyright 2017-2023 by Contributors + * \file hist_util.h + */ +#ifndef PLUGIN_SYCL_COMMON_HIST_UTIL_H_ +#define PLUGIN_SYCL_COMMON_HIST_UTIL_H_ + +#include +#include +#include + +#include "../data.h" +#include "row_set.h" + +#include "../../src/common/hist_util.h" +#include "../data/gradient_index.h" + +#include + +namespace xgboost { +namespace sycl { +namespace common { + +template +using GHistRow = USMVector, memory_type>; + +using BinTypeSize = ::xgboost::common::BinTypeSize; + +class ColumnMatrix; + +/*! + * \brief Fill histogram with zeroes + */ +template +void InitHist(::sycl::queue qu, + GHistRow* hist, + size_t size, ::sycl::event* event); + +/*! + * \brief Compute subtraction: dst = src1 - src2 + */ +template +::sycl::event SubtractionHist(::sycl::queue qu, + GHistRow* dst, + const GHistRow& src1, + const GHistRow& src2, + size_t size, ::sycl::event event_priv); + +/*! + * \brief Builder for histograms of gradient statistics + */ +template +class GHistBuilder { + public: + template + using GHistRowT = GHistRow; + + GHistBuilder() = default; + GHistBuilder(::sycl::queue qu, uint32_t nbins) : qu_{qu}, nbins_{nbins} {} + + // Construct a histogram via histogram aggregation + ::sycl::event BuildHist(const USMVector& gpair_device, + const RowSetCollection::Elem& row_indices, + const GHistIndexMatrix& gmat, + GHistRowT* HistCollection, + bool isDense, + GHistRowT* hist_buffer, + ::sycl::event event, + bool force_atomic_use = false); + + // Construct a histogram via subtraction trick + void SubtractionTrick(GHistRowT* self, + const GHistRowT& sibling, + const GHistRowT& parent); + + uint32_t GetNumBins() const { + return nbins_; + } + + private: + /*! 
\brief Number of all bins over all features */ + uint32_t nbins_ { 0 }; + + ::sycl::queue qu_; +}; +} // namespace common +} // namespace sycl +} // namespace xgboost +#endif // PLUGIN_SYCL_COMMON_HIST_UTIL_H_ diff --git a/plugin/sycl/common/partition_builder.h b/plugin/sycl/common/partition_builder.h index 37d1af241..c520ff31f 100644 --- a/plugin/sycl/common/partition_builder.h +++ b/plugin/sycl/common/partition_builder.h @@ -21,6 +21,9 @@ #pragma GCC diagnostic pop #include "../data.h" +#include "row_set.h" +#include "../data/gradient_index.h" +#include "../tree/expand_entry.h" #include @@ -28,6 +31,87 @@ namespace xgboost { namespace sycl { namespace common { +// split row indexes (rid_span) to 2 parts (both stored in rid_buf) depending +// on comparison of indexes values (idx_span) and split point (split_cond) +// Handle dense columns +template +inline ::sycl::event PartitionDenseKernel( + ::sycl::queue* qu, + const GHistIndexMatrix& gmat, + const RowSetCollection::Elem& rid_span, + const size_t fid, + const int32_t split_cond, + xgboost::common::Span* rid_buf, + size_t* parts_size, + ::sycl::event event) { + const size_t row_stride = gmat.row_stride; + const BinIdxType* gradient_index = gmat.index.data(); + const size_t* rid = rid_span.begin; + const size_t range_size = rid_span.Size(); + const size_t offset = gmat.cut.Ptrs()[fid]; + + size_t* p_rid_buf = rid_buf->data(); + + return qu->submit([&](::sycl::handler& cgh) { + cgh.depends_on(event); + cgh.parallel_for<>(::sycl::range<1>(range_size), [=](::sycl::item<1> nid) { + const size_t id = rid[nid.get_id(0)]; + const int32_t value = static_cast(gradient_index[id * row_stride + fid] + offset); + const bool is_left = value <= split_cond; + if (is_left) { + AtomicRef n_left(parts_size[0]); + p_rid_buf[n_left.fetch_add(1)] = id; + } else { + AtomicRef n_right(parts_size[1]); + p_rid_buf[range_size - n_right.fetch_add(1) - 1] = id; + } + }); + }); +} + +// split row indexes (rid_span) to 2 parts (both stored in rid_buf) depending +// on comparison of indexes values (idx_span) and split point (split_cond) +// Handle sparce columns +template +inline ::sycl::event PartitionSparseKernel(::sycl::queue* qu, + const GHistIndexMatrix& gmat, + const RowSetCollection::Elem& rid_span, + const size_t fid, + const int32_t split_cond, + xgboost::common::Span* rid_buf, + size_t* parts_size, + ::sycl::event event) { + const size_t row_stride = gmat.row_stride; + const BinIdxType* gradient_index = gmat.index.data(); + const size_t* rid = rid_span.begin; + const size_t range_size = rid_span.Size(); + const uint32_t* cut_ptrs = gmat.cut_device.Ptrs().DataConst(); + + size_t* p_rid_buf = rid_buf->data(); + return qu->submit([&](::sycl::handler& cgh) { + cgh.depends_on(event); + cgh.parallel_for<>(::sycl::range<1>(range_size), [=](::sycl::item<1> nid) { + const size_t id = rid[nid.get_id(0)]; + + const BinIdxType* gr_index_local = gradient_index + row_stride * id; + const int32_t fid_local = std::lower_bound(gr_index_local, + gr_index_local + row_stride, + cut_ptrs[fid]) - gr_index_local; + const bool is_left = (fid_local >= row_stride || + gr_index_local[fid_local] >= cut_ptrs[fid + 1]) ? 
+ default_left : + gr_index_local[fid_local] <= split_cond; + if (is_left) { + AtomicRef n_left(parts_size[0]); + p_rid_buf[n_left.fetch_add(1)] = id; + } else { + AtomicRef n_right(parts_size[1]); + p_rid_buf[range_size - n_right.fetch_add(1) - 1] = id; + } + }); + }); +} + // The builder is required for samples partition to left and rights children for set of nodes class PartitionBuilder { public: @@ -53,7 +137,6 @@ class PartitionBuilder { return result_rows_[2 * nid]; } - size_t GetNRightElems(int nid) const { return result_rows_[2 * nid + 1]; } @@ -72,19 +155,97 @@ class PartitionBuilder { return { data_.Data() + nodes_offsets_[nid], nodes_offsets_[nid + 1] - nodes_offsets_[nid] }; } + template + ::sycl::event Partition(const int32_t split_cond, + const GHistIndexMatrix& gmat, + const RowSetCollection::Elem& rid_span, + const xgboost::RegTree::Node& node, + xgboost::common::Span* rid_buf, + size_t* parts_size, + ::sycl::event event) { + const bst_uint fid = node.SplitIndex(); + const bool default_left = node.DefaultLeft(); + + if (gmat.IsDense()) { + if (default_left) { + return PartitionDenseKernel(qu_, gmat, rid_span, fid, + split_cond, rid_buf, parts_size, event); + } else { + return PartitionDenseKernel(qu_, gmat, rid_span, fid, + split_cond, rid_buf, parts_size, event); + } + } else { + if (default_left) { + return PartitionSparseKernel(qu_, gmat, rid_span, fid, + split_cond, rid_buf, parts_size, event); + } else { + return PartitionSparseKernel(qu_, gmat, rid_span, fid, + split_cond, rid_buf, parts_size, event); + } + } + } + + // Entry point for Partition + void Partition(const GHistIndexMatrix& gmat, + const std::vector nodes, + const RowSetCollection& row_set_collection, + const std::vector& split_conditions, + RegTree* p_tree, + ::sycl::event* general_event) { + nodes_events_.resize(n_nodes_); + + parts_size_.ResizeAndFill(qu_, 2 * n_nodes_, 0, general_event); + + for (size_t node_in_set = 0; node_in_set < n_nodes_; node_in_set++) { + const int32_t nid = nodes[node_in_set].nid; + ::sycl::event& node_event = nodes_events_[node_in_set]; + const auto& rid_span = row_set_collection[nid]; + if (rid_span.Size() > 0) { + const RegTree::Node& node = (*p_tree)[nid]; + xgboost::common::Span rid_buf = GetData(node_in_set); + size_t* part_size = parts_size_.Data() + 2 * node_in_set; + int32_t split_condition = split_conditions[node_in_set]; + switch (gmat.index.GetBinTypeSize()) { + case common::BinTypeSize::kUint8BinsTypeSize: + node_event = Partition(split_condition, gmat, rid_span, node, + &rid_buf, part_size, *general_event); + break; + case common::BinTypeSize::kUint16BinsTypeSize: + node_event = Partition(split_condition, gmat, rid_span, node, + &rid_buf, part_size, *general_event); + break; + case common::BinTypeSize::kUint32BinsTypeSize: + node_event = Partition(split_condition, gmat, rid_span, node, + &rid_buf, part_size, *general_event); + break; + default: + CHECK(false); // no default behavior + } + } else { + node_event = ::sycl::event(); + } + } + + *general_event = qu_->memcpy(result_rows_.data(), + parts_size_.DataConst(), + sizeof(size_t) * 2 * n_nodes_, + nodes_events_); + } + void MergeToArray(size_t nid, size_t* data_result, - ::sycl::event event) { + ::sycl::event* event) { size_t n_nodes_total = GetNLeftElems(nid) + GetNRightElems(nid); if (n_nodes_total > 0) { const size_t* data = data_.Data() + nodes_offsets_[nid]; - qu_->memcpy(data_result, data, sizeof(size_t) * n_nodes_total, event); + qu_->memcpy(data_result, data, sizeof(size_t) * n_nodes_total, *event); } } 
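// Editor's note: the Partition entry point above selects a kernel instantiation from the
// matrix's runtime bin width, and the same switch-on-BinTypeSize idiom recurs in the SYCL
// hist and gradient_index code later in this patch. A minimal standalone illustration of the
// idiom (hypothetical names, no SYCL dependency; not code from the plugin):
#include <cstddef>
#include <cstdint>
#include <numeric>
#include <vector>

enum class BinWidth : std::uint8_t { kU8, kU16, kU32 };

template <typename BinIdxT>
std::uint64_t SumBins(void const* data, std::size_t n) {
  auto const* bins = static_cast<BinIdxT const*>(data);   // view the type-erased buffer as BinIdxT
  return std::accumulate(bins, bins + n, std::uint64_t{0});
}

std::uint64_t SumBinsDispatch(BinWidth width, void const* data, std::size_t n) {
  switch (width) {                                         // runtime tag -> compile-time type
    case BinWidth::kU8:  return SumBins<std::uint8_t>(data, n);
    case BinWidth::kU16: return SumBins<std::uint16_t>(data, n);
    case BinWidth::kU32: return SumBins<std::uint32_t>(data, n);
  }
  return 0;
}

int main() {
  std::vector<std::uint16_t> bins{1, 2, 3};
  return static_cast<int>(SumBinsDispatch(BinWidth::kU16, bins.data(), bins.size()));  // 6
}
// Storing bin indices in the narrowest width that can address all cut points is what makes the
// uint8_t/uint16_t instantiations worthwhile; see ResizeIndex in gradient_index.cc below.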
protected: std::vector nodes_offsets_; std::vector result_rows_; + std::vector<::sycl::event> nodes_events_; size_t n_nodes_; USMVector parts_size_; diff --git a/plugin/sycl/common/row_set.h b/plugin/sycl/common/row_set.h new file mode 100644 index 000000000..574adbf8d --- /dev/null +++ b/plugin/sycl/common/row_set.h @@ -0,0 +1,123 @@ +/*! + * Copyright 2017-2023 XGBoost contributors + */ +#ifndef PLUGIN_SYCL_COMMON_ROW_SET_H_ +#define PLUGIN_SYCL_COMMON_ROW_SET_H_ + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wtautological-constant-compare" +#pragma GCC diagnostic ignored "-W#pragma-messages" +#include +#pragma GCC diagnostic pop +#include +#include +#include + +#include "../data.h" + +#include + +namespace xgboost { +namespace sycl { +namespace common { + + +/*! \brief Collection of rowsets stored on device in USM memory */ +class RowSetCollection { + public: + /*! \brief data structure to store an instance set, a subset of + * rows (instances) associated with a particular node in a decision + * tree. */ + struct Elem { + const size_t* begin{nullptr}; + const size_t* end{nullptr}; + bst_node_t node_id{-1}; // id of node associated with this instance set; -1 means uninitialized + Elem() + = default; + Elem(const size_t* begin, + const size_t* end, + bst_node_t node_id = -1) + : begin(begin), end(end), node_id(node_id) {} + + + inline size_t Size() const { + return end - begin; + } + }; + + inline size_t Size() const { + return elem_of_each_node_.size(); + } + + /*! \brief return corresponding element set given the node_id */ + inline const Elem& operator[](unsigned node_id) const { + const Elem& e = elem_of_each_node_[node_id]; + CHECK(e.begin != nullptr) + << "access element that is not in the set"; + return e; + } + + /*! \brief return corresponding element set given the node_id */ + inline Elem& operator[](unsigned node_id) { + Elem& e = elem_of_each_node_[node_id]; + return e; + } + + // clear up things + inline void Clear() { + elem_of_each_node_.clear(); + } + // initialize node id 0->everything + inline void Init() { + CHECK_EQ(elem_of_each_node_.size(), 0U); + + const size_t* begin = row_indices_.Begin(); + const size_t* end = row_indices_.End(); + elem_of_each_node_.emplace_back(Elem(begin, end, 0)); + } + + auto& Data() { return row_indices_; } + + // split rowset into two + inline void AddSplit(unsigned node_id, + unsigned left_node_id, + unsigned right_node_id, + size_t n_left, + size_t n_right) { + const Elem e = elem_of_each_node_[node_id]; + CHECK(e.begin != nullptr); + size_t* all_begin = row_indices_.Begin(); + size_t* begin = all_begin + (e.begin - all_begin); + + + CHECK_EQ(n_left + n_right, e.Size()); + CHECK_LE(begin + n_left, e.end); + CHECK_EQ(begin + n_left + n_right, e.end); + + + if (left_node_id >= elem_of_each_node_.size()) { + elem_of_each_node_.resize(left_node_id + 1, Elem(nullptr, nullptr, -1)); + } + if (right_node_id >= elem_of_each_node_.size()) { + elem_of_each_node_.resize(right_node_id + 1, Elem(nullptr, nullptr, -1)); + } + + + elem_of_each_node_[left_node_id] = Elem(begin, begin + n_left, left_node_id); + elem_of_each_node_[right_node_id] = Elem(begin + n_left, e.end, right_node_id); + elem_of_each_node_[node_id] = Elem(nullptr, nullptr, -1); + } + + private: + // stores the row indexes in the set + USMVector row_indices_; + // vector: node_id -> elements + std::vector elem_of_each_node_; +}; + +} // namespace common +} // namespace sycl +} // namespace xgboost + + +#endif // PLUGIN_SYCL_COMMON_ROW_SET_H_ diff --git 
a/plugin/sycl/data.h b/plugin/sycl/data.h index 489fde989..f420ef470 100644 --- a/plugin/sycl/data.h +++ b/plugin/sycl/data.h @@ -26,8 +26,13 @@ namespace xgboost { namespace sycl { -enum class MemoryType { shared, on_device}; +template +using AtomicRef = ::sycl::atomic_ref; +enum class MemoryType { shared, on_device}; template class USMDeleter { @@ -166,20 +171,20 @@ class USMVector { } } - ::sycl::event ResizeAndFill(::sycl::queue* qu, size_t size_new, int v) { + void ResizeAndFill(::sycl::queue* qu, size_t size_new, int v, ::sycl::event* event) { if (size_new <= size_) { size_ = size_new; - return qu->memset(data_.get(), v, size_new * sizeof(T)); + *event = qu->memset(data_.get(), v, size_new * sizeof(T), *event); } else if (size_new <= capacity_) { size_ = size_new; - return qu->memset(data_.get(), v, size_new * sizeof(T)); + *event = qu->memset(data_.get(), v, size_new * sizeof(T), *event); } else { size_t size_old = size_; auto data_old = data_; size_ = size_new; capacity_ = size_new; data_ = allocate_memory_(qu, size_); - return qu->memset(data_.get(), v, size_new * sizeof(T)); + *event = qu->memset(data_.get(), v, size_new * sizeof(T), *event); } } @@ -206,11 +211,16 @@ class USMVector { struct DeviceMatrix { DMatrix* p_mat; // Pointer to the original matrix on the host ::sycl::queue qu_; - USMVector row_ptr; + USMVector row_ptr; USMVector data; size_t total_offset; - DeviceMatrix(::sycl::queue qu, DMatrix* dmat) : p_mat(dmat), qu_(qu) { + DeviceMatrix() = default; + + void Init(::sycl::queue qu, DMatrix* dmat) { + qu_ = qu; + p_mat = dmat; + size_t num_row = 0; size_t num_nonzero = 0; for (auto &batch : dmat->GetBatches()) { @@ -221,27 +231,41 @@ struct DeviceMatrix { } row_ptr.Resize(&qu_, num_row + 1); + size_t* rows = row_ptr.Data(); data.Resize(&qu_, num_nonzero); size_t data_offset = 0; + ::sycl::event event; for (auto &batch : dmat->GetBatches()) { const auto& data_vec = batch.data.HostVector(); const auto& offset_vec = batch.offset.HostVector(); size_t batch_size = batch.Size(); if (batch_size > 0) { - std::copy(offset_vec.data(), offset_vec.data() + batch_size, - row_ptr.Data() + batch.base_rowid); - if (batch.base_rowid > 0) { - for (size_t i = 0; i < batch_size; i++) - row_ptr[i + batch.base_rowid] += batch.base_rowid; + const auto base_rowid = batch.base_rowid; + event = qu.memcpy(row_ptr.Data() + base_rowid, offset_vec.data(), + sizeof(size_t) * batch_size, event); + if (base_rowid > 0) { + qu.submit([&](::sycl::handler& cgh) { + cgh.depends_on(event); + cgh.parallel_for<>(::sycl::range<1>(batch_size), [=](::sycl::id<1> pid) { + int row_id = pid[0]; + rows[row_id] += base_rowid; + }); + }); } - qu.memcpy(data.Data() + data_offset, - data_vec.data(), - offset_vec[batch_size] * sizeof(Entry)).wait(); + event = qu.memcpy(data.Data() + data_offset, data_vec.data(), + sizeof(Entry) * offset_vec[batch_size], event); data_offset += offset_vec[batch_size]; + qu.wait(); } } - row_ptr[num_row] = data_offset; + qu.submit([&](::sycl::handler& cgh) { + cgh.depends_on(event); + cgh.single_task<>([=] { + rows[num_row] = data_offset; + }); + }); + qu.wait(); total_offset = data_offset; } diff --git a/plugin/sycl/data/gradient_index.cc b/plugin/sycl/data/gradient_index.cc new file mode 100644 index 000000000..e193b6689 --- /dev/null +++ b/plugin/sycl/data/gradient_index.cc @@ -0,0 +1,177 @@ +/*! 
+ * Copyright 2017-2024 by Contributors + * \file gradient_index.cc + */ +#include +#include +#include + +#include "gradient_index.h" + +#include + +namespace xgboost { +namespace sycl { +namespace common { + +uint32_t SearchBin(const bst_float* cut_values, const uint32_t* cut_ptrs, Entry const& e) { + auto beg = cut_ptrs[e.index]; + auto end = cut_ptrs[e.index + 1]; + auto it = std::upper_bound(cut_values + beg, cut_values + end, e.fvalue); + uint32_t idx = it - cut_values; + if (idx == end) { + idx -= 1; + } + return idx; +} + +template +void mergeSort(BinIdxType* begin, BinIdxType* end, BinIdxType* buf) { + const size_t total_len = end - begin; + for (size_t block_len = 1; block_len < total_len; block_len <<= 1) { + for (size_t cur_block = 0; cur_block + block_len < total_len; cur_block += 2 * block_len) { + size_t start = cur_block; + size_t mid = start + block_len; + size_t finish = mid + block_len < total_len ? mid + block_len : total_len; + size_t left_pos = start; + size_t right_pos = mid; + size_t pos = start; + while (left_pos < mid || right_pos < finish) { + if (left_pos < mid && (right_pos == finish || begin[left_pos] < begin[right_pos])) { + buf[pos++] = begin[left_pos++]; + } else { + buf[pos++] = begin[right_pos++]; + } + } + for (size_t i = start; i < finish; i++) begin[i] = buf[i]; + } + } +} + +template +void GHistIndexMatrix::SetIndexData(::sycl::queue qu, + BinIdxType* index_data, + const DeviceMatrix &dmat, + size_t nbins, + size_t row_stride, + uint32_t* offsets) { + if (nbins == 0) return; + const xgboost::Entry *data_ptr = dmat.data.DataConst(); + const bst_idx_t *offset_vec = dmat.row_ptr.DataConst(); + const size_t num_rows = dmat.row_ptr.Size() - 1; + const bst_float* cut_values = cut_device.Values().DataConst(); + const uint32_t* cut_ptrs = cut_device.Ptrs().DataConst(); + size_t* hit_count_ptr = hit_count_buff.Data(); + + // Sparse case only + if (!offsets) { + // sort_buff has type uint8_t + sort_buff.Resize(&qu, num_rows * row_stride * sizeof(BinIdxType)); + } + BinIdxType* sort_data = reinterpret_cast(sort_buff.Data()); + + auto event = qu.submit([&](::sycl::handler& cgh) { + cgh.parallel_for<>(::sycl::range<1>(num_rows), [=](::sycl::item<1> pid) { + const size_t i = pid.get_id(0); + const size_t ibegin = offset_vec[i]; + const size_t iend = offset_vec[i + 1]; + const size_t size = iend - ibegin; + const size_t start = i * row_stride; + for (bst_uint j = 0; j < size; ++j) { + uint32_t idx = SearchBin(cut_values, cut_ptrs, data_ptr[ibegin + j]); + index_data[start + j] = offsets ? 
idx - offsets[j] : idx; + AtomicRef hit_count_ref(hit_count_ptr[idx]); + hit_count_ref.fetch_add(1); + } + if (!offsets) { + // Sparse case only + mergeSort(index_data + start, index_data + start + size, sort_data + start); + for (bst_uint j = size; j < row_stride; ++j) { + index_data[start + j] = nbins; + } + } + }); + }); + qu.memcpy(hit_count.data(), hit_count_ptr, nbins * sizeof(size_t), event); + qu.wait(); +} + +void GHistIndexMatrix::ResizeIndex(size_t n_index, bool isDense) { + if ((max_num_bins - 1 <= static_cast(std::numeric_limits::max())) && isDense) { + index.SetBinTypeSize(BinTypeSize::kUint8BinsTypeSize); + index.Resize((sizeof(uint8_t)) * n_index); + } else if ((max_num_bins - 1 > static_cast(std::numeric_limits::max()) && + max_num_bins - 1 <= static_cast(std::numeric_limits::max())) && isDense) { + index.SetBinTypeSize(BinTypeSize::kUint16BinsTypeSize); + index.Resize((sizeof(uint16_t)) * n_index); + } else { + index.SetBinTypeSize(BinTypeSize::kUint32BinsTypeSize); + index.Resize((sizeof(uint32_t)) * n_index); + } +} + +void GHistIndexMatrix::Init(::sycl::queue qu, + Context const * ctx, + const DeviceMatrix& p_fmat_device, + int max_bins) { + nfeatures = p_fmat_device.p_mat->Info().num_col_; + + cut = xgboost::common::SketchOnDMatrix(ctx, p_fmat_device.p_mat, max_bins); + cut_device.Init(qu, cut); + + max_num_bins = max_bins; + const uint32_t nbins = cut.Ptrs().back(); + this->nbins = nbins; + hit_count.resize(nbins, 0); + hit_count_buff.Resize(&qu, nbins, 0); + + this->p_fmat = p_fmat_device.p_mat; + const bool isDense = p_fmat_device.p_mat->IsDense(); + this->isDense_ = isDense; + + index.setQueue(qu); + + row_stride = 0; + for (const auto& batch : p_fmat_device.p_mat->GetBatches()) { + const auto& row_offset = batch.offset.ConstHostVector(); + for (auto i = 1ull; i < row_offset.size(); i++) { + row_stride = std::max(row_stride, static_cast(row_offset[i] - row_offset[i - 1])); + } + } + + const size_t n_offsets = cut_device.Ptrs().Size() - 1; + const size_t n_rows = p_fmat_device.row_ptr.Size() - 1; + const size_t n_index = n_rows * row_stride; + ResizeIndex(n_index, isDense); + + CHECK_GT(cut_device.Values().Size(), 0U); + + uint32_t* offsets = nullptr; + if (isDense) { + index.ResizeOffset(n_offsets); + offsets = index.Offset(); + qu.memcpy(offsets, cut_device.Ptrs().DataConst(), + sizeof(uint32_t) * n_offsets).wait_and_throw(); + } + + if (isDense) { + BinTypeSize curent_bin_size = index.GetBinTypeSize(); + if (curent_bin_size == BinTypeSize::kUint8BinsTypeSize) { + SetIndexData(qu, index.data(), p_fmat_device, nbins, row_stride, offsets); + + } else if (curent_bin_size == BinTypeSize::kUint16BinsTypeSize) { + SetIndexData(qu, index.data(), p_fmat_device, nbins, row_stride, offsets); + } else { + CHECK_EQ(curent_bin_size, BinTypeSize::kUint32BinsTypeSize); + SetIndexData(qu, index.data(), p_fmat_device, nbins, row_stride, offsets); + } + /* For sparse DMatrix we have to store index of feature for each bin + in index field to chose right offset. So offset is nullptr and index is not reduced */ + } else { + SetIndexData(qu, index.data(), p_fmat_device, nbins, row_stride, offsets); + } +} + +} // namespace common +} // namespace sycl +} // namespace xgboost diff --git a/plugin/sycl/data/gradient_index.h b/plugin/sycl/data/gradient_index.h new file mode 100644 index 000000000..13577025c --- /dev/null +++ b/plugin/sycl/data/gradient_index.h @@ -0,0 +1,216 @@ +/*! 
+ * Copyright 2017-2024 by Contributors + * \file gradient_index.h + */ +#ifndef PLUGIN_SYCL_DATA_GRADIENT_INDEX_H_ +#define PLUGIN_SYCL_DATA_GRADIENT_INDEX_H_ + +#include + +#include "../data.h" +#include "../../src/common/hist_util.h" + +#include + +namespace xgboost { +namespace sycl { +namespace common { + +/*! + * \brief SYCL implementation of HistogramCuts stored in USM buffers to provide access from device kernels + */ +class HistogramCuts { + protected: + using BinIdx = uint32_t; + + public: + HistogramCuts() {} + + explicit HistogramCuts(::sycl::queue qu) {} + + ~HistogramCuts() { + } + + void Init(::sycl::queue qu, xgboost::common::HistogramCuts const& cuts) { + qu_ = qu; + cut_values_.Init(&qu_, cuts.cut_values_.HostVector()); + cut_ptrs_.Init(&qu_, cuts.cut_ptrs_.HostVector()); + min_vals_.Init(&qu_, cuts.min_vals_.HostVector()); + } + + // Getters for USM buffers to pass pointers into device kernels + const USMVector& Ptrs() const { return cut_ptrs_; } + const USMVector& Values() const { return cut_values_; } + const USMVector& MinValues() const { return min_vals_; } + + private: + USMVector cut_values_; + USMVector cut_ptrs_; + USMVector min_vals_; + ::sycl::queue qu_; +}; + +using BinTypeSize = ::xgboost::common::BinTypeSize; + +/*! + * \brief Index data and offsets stored in USM buffers to provide access from device kernels + */ +struct Index { + Index() { + SetBinTypeSize(binTypeSize_); + } + Index(const Index& i) = delete; + Index& operator=(Index i) = delete; + Index(Index&& i) = delete; + Index& operator=(Index&& i) = delete; + uint32_t operator[](size_t i) const { + if (!offset_.Empty()) { + return func_(data_.DataConst(), i) + offset_[i%p_]; + } else { + return func_(data_.DataConst(), i); + } + } + void SetBinTypeSize(BinTypeSize binTypeSize) { + binTypeSize_ = binTypeSize; + switch (binTypeSize) { + case BinTypeSize::kUint8BinsTypeSize: + func_ = &GetValueFromUint8; + break; + case BinTypeSize::kUint16BinsTypeSize: + func_ = &GetValueFromUint16; + break; + case BinTypeSize::kUint32BinsTypeSize: + func_ = &GetValueFromUint32; + break; + default: + CHECK(binTypeSize == BinTypeSize::kUint8BinsTypeSize || + binTypeSize == BinTypeSize::kUint16BinsTypeSize || + binTypeSize == BinTypeSize::kUint32BinsTypeSize); + } + } + BinTypeSize GetBinTypeSize() const { + return binTypeSize_; + } + + template + T* data() { + return reinterpret_cast(data_.Data()); + } + + template + const T* data() const { + return reinterpret_cast(data_.DataConst()); + } + + uint32_t* Offset() { + return offset_.Data(); + } + + const uint32_t* Offset() const { + return offset_.DataConst(); + } + + size_t Size() const { + return data_.Size() / (binTypeSize_); + } + + void Resize(const size_t nBytesData) { + data_.Resize(&qu_, nBytesData); + } + + void ResizeOffset(const size_t nDisps) { + offset_.Resize(&qu_, nDisps); + p_ = nDisps; + } + + uint8_t* begin() const { + return data_.Begin(); + } + + uint8_t* end() const { + return data_.End(); + } + + void setQueue(::sycl::queue qu) { + qu_ = qu; + } + + private: + static uint32_t GetValueFromUint8(const uint8_t* t, size_t i) { + return reinterpret_cast(t)[i]; + } + static uint32_t GetValueFromUint16(const uint8_t* t, size_t i) { + return reinterpret_cast(t)[i]; + } + static uint32_t GetValueFromUint32(const uint8_t* t, size_t i) { + return reinterpret_cast(t)[i]; + } + + using Func = uint32_t (*)(const uint8_t*, size_t); + + USMVector data_; + // size of this field is equal to number of features + USMVector offset_; + BinTypeSize binTypeSize_ 
{BinTypeSize::kUint8BinsTypeSize}; + size_t p_ {1}; + Func func_; + + ::sycl::queue qu_; +}; + +/*! + * \brief Preprocessed global index matrix, in CSR format, stored in USM buffers + * + * Transform floating values to integer index in histogram + */ +struct GHistIndexMatrix { + /*! \brief row pointer to rows by element position */ + /*! \brief The index data */ + Index index; + /*! \brief hit count of each index */ + std::vector hit_count; + /*! \brief buffers for calculations */ + USMVector hit_count_buff; + USMVector sort_buff; + /*! \brief The corresponding cuts */ + xgboost::common::HistogramCuts cut; + HistogramCuts cut_device; + DMatrix* p_fmat; + size_t max_num_bins; + size_t nbins; + size_t nfeatures; + size_t row_stride; + + // Create a global histogram matrix based on a given DMatrix device wrapper + void Init(::sycl::queue qu, Context const * ctx, + const sycl::DeviceMatrix& p_fmat_device, int max_num_bins); + + template + void SetIndexData(::sycl::queue qu, BinIdxType* index_data, + const sycl::DeviceMatrix &dmat_device, + size_t nbins, size_t row_stride, uint32_t* offsets); + + void ResizeIndex(size_t n_index, bool isDense); + + inline void GetFeatureCounts(size_t* counts) const { + auto nfeature = cut_device.Ptrs().Size() - 1; + for (unsigned fid = 0; fid < nfeature; ++fid) { + auto ibegin = cut_device.Ptrs()[fid]; + auto iend = cut_device.Ptrs()[fid + 1]; + for (auto i = ibegin; i < iend; ++i) { + *(counts + fid) += hit_count[i]; + } + } + } + inline bool IsDense() const { + return isDense_; + } + + private: + bool isDense_; +}; + +} // namespace common +} // namespace sycl +} // namespace xgboost +#endif // PLUGIN_SYCL_DATA_GRADIENT_INDEX_H_ diff --git a/plugin/sycl/predictor/predictor.cc b/plugin/sycl/predictor/predictor.cc index dd56dd3bd..943949c2a 100755 --- a/plugin/sycl/predictor/predictor.cc +++ b/plugin/sycl/predictor/predictor.cc @@ -280,7 +280,8 @@ class Predictor : public xgboost::Predictor { uint32_t tree_end = 0) const override { ::sycl::queue qu = device_manager.GetQueue(ctx_->Device()); // TODO(razdoburdin): remove temporary workaround after cache fix - sycl::DeviceMatrix device_matrix(qu, dmat); + sycl::DeviceMatrix device_matrix; + device_matrix.Init(qu, dmat); auto* out_preds = &predts->predictions; if (tree_end == 0) { diff --git a/plugin/sycl/tree/expand_entry.h b/plugin/sycl/tree/expand_entry.h new file mode 100644 index 000000000..2520ff95d --- /dev/null +++ b/plugin/sycl/tree/expand_entry.h @@ -0,0 +1,50 @@ +/*! + * Copyright 2017-2024 by Contributors + * \file expand_entry.h + */ +#ifndef PLUGIN_SYCL_TREE_EXPAND_ENTRY_H_ +#define PLUGIN_SYCL_TREE_EXPAND_ENTRY_H_ + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wtautological-constant-compare" +#include "../../src/tree/constraints.h" +#pragma GCC diagnostic pop +#include "../../src/tree/hist/expand_entry.h" + +namespace xgboost { +namespace sycl { +namespace tree { +/* tree growing policies */ +struct ExpandEntry : public xgboost::tree::ExpandEntryImpl { + static constexpr bst_node_t kRootNid = 0; + + xgboost::tree::SplitEntry split; + + ExpandEntry(int nid, int depth) : ExpandEntryImpl{nid, depth} {} + + inline bst_node_t GetSiblingId(const xgboost::RegTree* p_tree) const { + CHECK_EQ((*p_tree)[nid].IsRoot(), false); + const size_t parent_id = (*p_tree)[nid].Parent(); + return GetSiblingId(p_tree, parent_id); + } + + inline bst_node_t GetSiblingId(const xgboost::RegTree* p_tree, size_t parent_id) const { + return p_tree->IsLeftChild(nid) ? 
p_tree->RightChild(parent_id) + : p_tree->LeftChild(parent_id); + } + + bool IsValidImpl(xgboost::tree::TrainParam const ¶m, int32_t num_leaves) const { + if (split.loss_chg <= kRtEps) return false; + if (split.loss_chg < param.min_split_loss) return false; + if (param.max_depth > 0 && depth == param.max_depth) return false; + if (param.max_leaves > 0 && num_leaves == param.max_leaves) return false; + + return true; + } +}; + +} // namespace tree +} // namespace sycl +} // namespace xgboost + +#endif // PLUGIN_SYCL_TREE_EXPAND_ENTRY_H_ diff --git a/plugin/sycl/tree/param.h b/plugin/sycl/tree/param.h new file mode 100644 index 000000000..1b47d83a4 --- /dev/null +++ b/plugin/sycl/tree/param.h @@ -0,0 +1,55 @@ +/*! + * Copyright 2014-2024 by Contributors + */ +#ifndef PLUGIN_SYCL_TREE_PARAM_H_ +#define PLUGIN_SYCL_TREE_PARAM_H_ + + +#include +#include +#include +#include +#include + + +#include "xgboost/parameter.h" +#include "xgboost/data.h" +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wtautological-constant-compare" +#include "../src/tree/param.h" +#pragma GCC diagnostic pop + +#include + +namespace xgboost { +namespace sycl { +namespace tree { + + +/*! \brief Wrapper for necessary training parameters for regression tree to access on device */ +/* The original structure xgboost::tree::TrainParam can't be used, + * since std::vector are not copyable on sycl-devices. + */ +struct TrainParam { + float min_child_weight; + float reg_lambda; + float reg_alpha; + float max_delta_step; + + TrainParam() {} + + explicit TrainParam(const xgboost::tree::TrainParam& param) { + reg_lambda = param.reg_lambda; + reg_alpha = param.reg_alpha; + min_child_weight = param.min_child_weight; + max_delta_step = param.max_delta_step; + } +}; + +template +using GradStats = xgboost::detail::GradientPairInternal; + +} // namespace tree +} // namespace sycl +} // namespace xgboost +#endif // PLUGIN_SYCL_TREE_PARAM_H_ diff --git a/plugin/sycl/tree/split_evaluator.h b/plugin/sycl/tree/split_evaluator.h new file mode 100644 index 000000000..2f1e8c7c4 --- /dev/null +++ b/plugin/sycl/tree/split_evaluator.h @@ -0,0 +1,208 @@ +/*! + * Copyright 2018-2024 by Contributors + */ + +#ifndef PLUGIN_SYCL_TREE_SPLIT_EVALUATOR_H_ +#define PLUGIN_SYCL_TREE_SPLIT_EVALUATOR_H_ + +#include +#include +#include +#include +#include + +#include "param.h" +#include "../data.h" + +#include "xgboost/tree_model.h" +#include "xgboost/host_device_vector.h" +#include "xgboost/context.h" +#include "../../src/common/transform.h" +#include "../../src/common/math.h" +#include "../../src/tree/param.h" + +#include + +namespace xgboost { +namespace sycl { +namespace tree { + +/*! \brief SYCL implementation of TreeEvaluator, with USM memory for temporary buffer to access on device. + * It also contains own implementation of SplitEvaluator for device compilation, because some of the + functions from the original SplitEvaluator are currently not supported + */ + +template +class TreeEvaluator { + // hist and exact use parent id to calculate constraints. 
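// Editor's note: the "std::vector are not copyable on sycl-devices" remark in param.h above is
// the reason this plugin mirrors TrainParam into a flat struct: SYCL kernels may only capture
// device-copyable objects, which trivially copyable aggregates are and vector-holding parameter
// classes are not. A hypothetical, compile-checkable restatement (the two structs below are
// stand-ins for illustration, not the real types):
#include <type_traits>
#include <vector>

struct HostParam {                 // shaped like a host-side parameter object
  float reg_lambda;
  std::vector<int> monotone_constraints;
};
struct DeviceParam {               // shaped like the flat mirror used in kernels
  float min_child_weight, reg_lambda, reg_alpha, max_delta_step;
};

static_assert(!std::is_trivially_copyable_v<HostParam>,
              "vector members make the host parameter unsuitable for device capture");
static_assert(std::is_trivially_copyable_v<DeviceParam>,
              "the flat mirror can be captured by value in a SYCL kernel");

int main() { return 0; }
// TreeEvaluator here follows the same rule: the constraint bounds live in USM buffers and only
// raw pointers plus the flat TrainParam are handed to device code via GetEvaluator().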
+ static constexpr bst_node_t kRootParentId = + (-1 & static_cast((1U << 31) - 1)); + + USMVector lower_bounds_; + USMVector upper_bounds_; + USMVector monotone_; + TrainParam param_; + ::sycl::queue qu_; + bool has_constraint_; + + public: + void Reset(::sycl::queue qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) { + qu_ = qu; + + has_constraint_ = false; + for (const auto& constraint : p.monotone_constraints) { + if (constraint != 0) { + has_constraint_ = true; + break; + } + } + + if (has_constraint_) { + monotone_.Resize(&qu_, n_features, 0); + qu_.memcpy(monotone_.Data(), p.monotone_constraints.data(), + sizeof(int) * p.monotone_constraints.size()); + qu_.wait(); + + lower_bounds_.Resize(&qu_, p.MaxNodes(), std::numeric_limits::lowest()); + upper_bounds_.Resize(&qu_, p.MaxNodes(), std::numeric_limits::max()); + } + param_ = TrainParam(p); + } + + bool HasConstraint() const { + return has_constraint_; + } + + TreeEvaluator(::sycl::queue qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) { + Reset(qu, p, n_features); + } + + struct SplitEvaluator { + const int* constraints; + const GradType* lower; + const GradType* upper; + bool has_constraint; + TrainParam param; + + GradType CalcSplitGain(bst_node_t nidx, + bst_feature_t fidx, + const GradStats& left, + const GradStats& right) const { + const GradType negative_infinity = -std::numeric_limits::infinity(); + GradType wleft = this->CalcWeight(nidx, left); + GradType wright = this->CalcWeight(nidx, right); + + GradType gain = this->CalcGainGivenWeight(nidx, left, wleft) + + this->CalcGainGivenWeight(nidx, right, wright); + if (!has_constraint) { + return gain; + } + + int constraint = constraints[fidx]; + if (constraint == 0) { + return gain; + } else if (constraint > 0) { + return wleft <= wright ? gain : negative_infinity; + } else { + return wleft >= wright ? gain : negative_infinity; + } + } + + inline static GradType ThresholdL1(GradType w, float alpha) { + if (w > + alpha) { + return w - alpha; + } + if (w < - alpha) { + return w + alpha; + } + return 0.0; + } + + inline GradType CalcWeight(GradType sum_grad, GradType sum_hess) const { + if (sum_hess < param.min_child_weight || sum_hess <= 0.0) { + return 0.0; + } + GradType dw = -this->ThresholdL1(sum_grad, param.reg_alpha) / (sum_hess + param.reg_lambda); + if (param.max_delta_step != 0.0f && std::abs(dw) > param.max_delta_step) { + dw = ::sycl::copysign((GradType)param.max_delta_step, dw); + } + return dw; + } + + inline GradType CalcWeight(bst_node_t nodeid, const GradStats& stats) const { + GradType w = this->CalcWeight(stats.GetGrad(), stats.GetHess()); + if (!has_constraint) { + return w; + } + + if (nodeid == kRootParentId) { + return w; + } else if (w < lower[nodeid]) { + return lower[nodeid]; + } else if (w > upper[nodeid]) { + return upper[nodeid]; + } else { + return w; + } + } + + inline GradType CalcGainGivenWeight(GradType sum_grad, GradType sum_hess, GradType w) const { + return -(2.0f * sum_grad * w + (sum_hess + param.reg_lambda) * xgboost::common::Sqr(w)); + } + + inline GradType CalcGainGivenWeight(bst_node_t nid, const GradStats& stats, + GradType w) const { + if (stats.GetHess() <= 0) { + return .0f; + } + // Avoiding tree::CalcGainGivenWeight can significantly reduce avg floating point error. 
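// Worked example of the closed form below: with sum_grad = -4, sum_hess = 9,
// reg_alpha = 0 and reg_lambda = 1, ThresholdL1 leaves the gradient unchanged, the
// optimal weight is 4 / (9 + 1) = 0.4, and the gain is (-4)^2 / (9 + 1) = 1.6, which
// matches -(2 * (-4) * 0.4 + (9 + 1) * 0.4 * 0.4) computed by CalcGainGivenWeight above.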
+ if (param.max_delta_step == 0.0f && has_constraint == false) { + return xgboost::common::Sqr(this->ThresholdL1(stats.GetGrad(), param.reg_alpha)) / + (stats.GetHess() + param.reg_lambda); + } + return this->CalcGainGivenWeight(stats.GetGrad(), stats.GetHess(), w); + } + + GradType CalcGain(bst_node_t nid, const GradStats& stats) const { + return this->CalcGainGivenWeight(nid, stats, this->CalcWeight(nid, stats)); + } + }; + + public: + /* Get a view to the evaluator that can be passed down to device. */ + auto GetEvaluator() const { + return SplitEvaluator{monotone_.DataConst(), + lower_bounds_.DataConst(), + upper_bounds_.DataConst(), + has_constraint_, + param_}; + } + + void AddSplit(bst_node_t nodeid, bst_node_t leftid, bst_node_t rightid, + bst_feature_t f, GradType left_weight, GradType right_weight) { + if (!has_constraint_) { + return; + } + + lower_bounds_[leftid] = lower_bounds_[nodeid]; + upper_bounds_[leftid] = upper_bounds_[nodeid]; + + lower_bounds_[rightid] = lower_bounds_[nodeid]; + upper_bounds_[rightid] = upper_bounds_[nodeid]; + int32_t c = monotone_[f]; + GradType mid = (left_weight + right_weight) / 2; + + if (c < 0) { + lower_bounds_[leftid] = mid; + upper_bounds_[rightid] = mid; + } else if (c > 0) { + upper_bounds_[leftid] = mid; + lower_bounds_[rightid] = mid; + } + } +}; +} // namespace tree +} // namespace sycl +} // namespace xgboost + +#endif // PLUGIN_SYCL_TREE_SPLIT_EVALUATOR_H_ diff --git a/plugin/sycl/tree/updater_quantile_hist.cc b/plugin/sycl/tree/updater_quantile_hist.cc new file mode 100644 index 000000000..98a42c3c8 --- /dev/null +++ b/plugin/sycl/tree/updater_quantile_hist.cc @@ -0,0 +1,55 @@ +/*! + * Copyright 2017-2024 by Contributors + * \file updater_quantile_hist.cc + */ +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wtautological-constant-compare" +#pragma GCC diagnostic ignored "-W#pragma-messages" +#include "xgboost/tree_updater.h" +#pragma GCC diagnostic pop + +#include "xgboost/logging.h" + +#include "updater_quantile_hist.h" +#include "../data.h" + +namespace xgboost { +namespace sycl { +namespace tree { + +DMLC_REGISTRY_FILE_TAG(updater_quantile_hist_sycl); + +DMLC_REGISTER_PARAMETER(HistMakerTrainParam); + +void QuantileHistMaker::Configure(const Args& args) { + const DeviceOrd device_spec = ctx_->Device(); + qu_ = device_manager.GetQueue(device_spec); + + param_.UpdateAllowUnknown(args); + hist_maker_param_.UpdateAllowUnknown(args); +} + +void QuantileHistMaker::Update(xgboost::tree::TrainParam const *param, + linalg::Matrix* gpair, + DMatrix *dmat, + xgboost::common::Span> out_position, + const std::vector &trees) { + LOG(FATAL) << "Not Implemented yet"; +} + +bool QuantileHistMaker::UpdatePredictionCache(const DMatrix* data, + linalg::MatrixView out_preds) { + LOG(FATAL) << "Not Implemented yet"; +} + +XGBOOST_REGISTER_TREE_UPDATER(QuantileHistMaker, "grow_quantile_histmaker_sycl") +.describe("Grow tree using quantized histogram with SYCL.") +.set_body( + [](Context const* ctx, ObjInfo const * task) { + return new QuantileHistMaker(ctx, task); + }); +} // namespace tree +} // namespace sycl +} // namespace xgboost diff --git a/plugin/sycl/tree/updater_quantile_hist.h b/plugin/sycl/tree/updater_quantile_hist.h new file mode 100644 index 000000000..93a50de3e --- /dev/null +++ b/plugin/sycl/tree/updater_quantile_hist.h @@ -0,0 +1,91 @@ +/*! 
+ * Copyright 2017-2024 by Contributors + * \file updater_quantile_hist.h + */ +#ifndef PLUGIN_SYCL_TREE_UPDATER_QUANTILE_HIST_H_ +#define PLUGIN_SYCL_TREE_UPDATER_QUANTILE_HIST_H_ + +#include +#include + +#include + +#include "../data/gradient_index.h" +#include "../common/hist_util.h" +#include "../common/row_set.h" +#include "../common/partition_builder.h" +#include "split_evaluator.h" +#include "../device_manager.h" + +#include "xgboost/data.h" +#include "xgboost/json.h" +#include "../../src/tree/constraints.h" +#include "../../src/common/random.h" + +namespace xgboost { +namespace sycl { +namespace tree { + +// training parameters specific to this algorithm +struct HistMakerTrainParam + : public XGBoostParameter { + bool single_precision_histogram = false; + // declare parameters + DMLC_DECLARE_PARAMETER(HistMakerTrainParam) { + DMLC_DECLARE_FIELD(single_precision_histogram).set_default(false).describe( + "Use single precision to build histograms."); + } +}; + +/*! \brief construct a tree using quantized feature values with SYCL backend*/ +class QuantileHistMaker: public TreeUpdater { + public: + QuantileHistMaker(Context const* ctx, ObjInfo const * task) : + TreeUpdater(ctx), task_{task} { + updater_monitor_.Init("SYCLQuantileHistMaker"); + } + void Configure(const Args& args) override; + + void Update(xgboost::tree::TrainParam const *param, + linalg::Matrix* gpair, + DMatrix* dmat, + xgboost::common::Span> out_position, + const std::vector& trees) override; + + bool UpdatePredictionCache(const DMatrix* data, + linalg::MatrixView out_preds) override; + + void LoadConfig(Json const& in) override { + auto const& config = get(in); + FromJson(config.at("train_param"), &this->param_); + FromJson(config.at("sycl_hist_train_param"), &this->hist_maker_param_); + } + + void SaveConfig(Json* p_out) const override { + auto& out = *p_out; + out["train_param"] = ToJson(param_); + out["sycl_hist_train_param"] = ToJson(hist_maker_param_); + } + + char const* Name() const override { + return "grow_quantile_histmaker_sycl"; + } + + protected: + HistMakerTrainParam hist_maker_param_; + // training parameter + xgboost::tree::TrainParam param_; + + xgboost::common::Monitor updater_monitor_; + + ::sycl::queue qu_; + DeviceManager device_manager; + ObjInfo const *task_{nullptr}; +}; + + +} // namespace tree +} // namespace sycl +} // namespace xgboost + +#endif // PLUGIN_SYCL_TREE_UPDATER_QUANTILE_HIST_H_ diff --git a/python-package/packager/nativelib.py b/python-package/packager/nativelib.py index 0227cff37..42b510eef 100644 --- a/python-package/packager/nativelib.py +++ b/python-package/packager/nativelib.py @@ -32,7 +32,10 @@ def build_libxgboost( build_dir: pathlib.Path, build_config: BuildConfiguration, ) -> pathlib.Path: - """Build libxgboost in a temporary directory and obtain the path to built libxgboost""" + """Build libxgboost in a temporary directory and obtain the path to built + libxgboost. + + """ logger = logging.getLogger("xgboost.packager.build_libxgboost") if not cpp_src_dir.is_dir(): @@ -51,8 +54,8 @@ def build_libxgboost( cmake_cmd.extend(build_config.get_cmake_args()) # Flag for cross-compiling for Apple Silicon - # We use environment variable because it's the only way to pass down custom flags - # through the cibuildwheel package, which calls `pip wheel` command. + # We use environment variable because it's the only way to pass down custom + # flags through the cibuildwheel package, which calls `pip wheel` command. 
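# Illustrative example: running `CIBW_TARGET_OSX_ARM64=1 pip wheel .` by hand would take
# the branch below as well, since only the presence of the variable is checked (its value
# is ignored), and the generated CMake invocation then includes
# -DCMAKE_OSX_ARCHITECTURES=arm64.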
if "CIBW_TARGET_OSX_ARM64" in os.environ: cmake_cmd.append("-DCMAKE_OSX_ARCHITECTURES=arm64") diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 27331d3de..36e4bdcf0 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -804,10 +804,11 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m Otherwise, one can pass a list-like input with the same length as number of columns in `data`, with the following possible values: - - "c", which represents categorical columns. - - "q", which represents numeric columns. - - "int", which represents integer columns. - - "i", which represents boolean columns. + + - "c", which represents categorical columns. + - "q", which represents numeric columns. + - "int", which represents integer columns. + - "i", which represents boolean columns. Note that, while categorical types are treated differently from the rest for model fitting purposes, the other types do not influence @@ -861,9 +862,9 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m self.nthread = nthread if nthread is not None else -1 self.silent = silent - # force into void_p, mac need to pass things in as void_p - if data is None: - self.handle: Optional[ctypes.c_void_p] = None + if isinstance(data, ctypes.c_void_p): + # Used for constructing DMatrix slice. + self.handle = data return from .data import _is_iter, dispatch_data_backend @@ -925,9 +926,10 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m self.handle = handle def __del__(self) -> None: - if hasattr(self, "handle") and self.handle: + if hasattr(self, "handle"): + assert self.handle is not None _check_call(_LIB.XGDMatrixFree(self.handle)) - self.handle = None + del self.handle @_deprecate_positional_args def set_info( @@ -1281,19 +1283,19 @@ class DMatrix: # pylint: disable=too-many-instance-attributes,too-many-public-m """ from .data import _maybe_np_slice - res = DMatrix(None) - res.handle = ctypes.c_void_p() + handle = ctypes.c_void_p() + rindex = _maybe_np_slice(rindex, dtype=np.int32) _check_call( _LIB.XGDMatrixSliceDMatrixEx( self.handle, c_array(ctypes.c_int, rindex), c_bst_ulong(len(rindex)), - ctypes.byref(res.handle), + ctypes.byref(handle), ctypes.c_int(1 if allow_groups else 0), ) ) - return res + return DMatrix(handle) @property def feature_names(self) -> Optional[FeatureNames]: diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 49a0f43b7..12b576566 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -909,9 +909,19 @@ def _transform_cudf_df( enable_categorical: bool, ) -> Tuple[ctypes.c_void_p, list, Optional[FeatureNames], Optional[FeatureTypes]]: try: - from cudf.api.types import is_categorical_dtype + from cudf.api.types import is_bool_dtype, is_categorical_dtype except ImportError: from cudf.utils.dtypes import is_categorical_dtype + from pandas.api.types import is_bool_dtype + + # Work around https://github.com/dmlc/xgboost/issues/10181 + if _is_cudf_ser(data): + if is_bool_dtype(data.dtype): + data = data.astype(np.uint8) + else: + data = data.astype( + {col: np.uint8 for col in data.select_dtypes(include="bool")} + ) if _is_cudf_ser(data): dtypes = [data.dtype] @@ -1053,10 +1063,10 @@ def _is_dlpack(data: DataType) -> bool: def _transform_dlpack(data: DataType) -> bool: - from cupy import fromDlpack # pylint: disable=E0401 + from cupy import from_dlpack # pylint: disable=E0401 assert "used_dltensor" not in 
str(data) - data = fromDlpack(data) + data = from_dlpack(data) return data diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index 5d651948c..c4713a9e4 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -5,12 +5,14 @@ import json import os import warnings from concurrent.futures import ThreadPoolExecutor +from inspect import signature from typing import ( Any, Callable, Dict, List, Optional, + Protocol, Sequence, Tuple, Type, @@ -67,14 +69,20 @@ def _can_use_qdm(tree_method: Optional[str]) -> bool: return tree_method in ("hist", "gpu_hist", None, "auto") -SklObjective = Optional[ - Union[str, Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]] -] +class _SklObjWProto(Protocol): # pylint: disable=too-few-public-methods + def __call__( + self, + y_true: ArrayLike, + y_pred: ArrayLike, + sample_weight: Optional[ArrayLike], + ) -> Tuple[ArrayLike, ArrayLike]: ... -def _objective_decorator( - func: Callable[[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]] -) -> Objective: +_SklObjProto = Callable[[ArrayLike, ArrayLike], Tuple[np.ndarray, np.ndarray]] +SklObjective = Optional[Union[str, _SklObjWProto, _SklObjProto]] + + +def _objective_decorator(func: Union[_SklObjWProto, _SklObjProto]) -> Objective: """Decorate an objective function Converts an objective function using the typical sklearn metrics @@ -89,6 +97,8 @@ def _objective_decorator( The target values y_pred: array_like of shape [n_samples] The predicted values + sample_weight : + Optional sample weight, None or a ndarray. Returns ------- @@ -103,10 +113,25 @@ def _objective_decorator( ``dmatrix.get_label()`` """ + parameters = signature(func).parameters + supports_sw = "sample_weight" in parameters + def inner(preds: np.ndarray, dmatrix: DMatrix) -> Tuple[np.ndarray, np.ndarray]: - """internal function""" + """Internal function.""" + sample_weight = dmatrix.get_weight() labels = dmatrix.get_label() - return func(labels, preds) + + if sample_weight.size > 0 and not supports_sw: + raise ValueError( + "Custom objective doesn't have the `sample_weight` parameter while" + " sample_weight is used." + ) + if sample_weight.size > 0: + fnw = cast(_SklObjWProto, func) + return fnw(labels, preds, sample_weight=sample_weight) + + fn = cast(_SklObjProto, func) + return fn(labels, preds) return inner @@ -172,75 +197,121 @@ def ltr_metric_decorator(func: Callable, n_jobs: Optional[int]) -> Metric: return inner -__estimator_doc = """ - n_estimators : Optional[int] +__estimator_doc = f""" + n_estimators : {Optional[int]} Number of gradient boosted trees. Equivalent to number of boosting rounds. """ __model_doc = f""" - max_depth : Optional[int] + max_depth : {Optional[int]} + Maximum tree depth for base learners. - max_leaves : + + max_leaves : {Optional[int]} + Maximum number of leaves; 0 indicates no limit. - max_bin : + + max_bin : {Optional[int]} + If using histogram-based algorithm, maximum number of bins per feature - grow_policy : - Tree growing policy. 0: favor splitting at nodes closest to the node, i.e. grow - depth-wise. 1: favor splitting at nodes with highest loss change. - learning_rate : Optional[float] + + grow_policy : {Optional[str]} + + Tree growing policy. + + - depthwise: Favors splitting at nodes closest to the node, + - lossguide: Favors splitting at nodes with highest loss change. 
+ + learning_rate : {Optional[float]} + Boosting learning rate (xgb's "eta") - verbosity : Optional[int] + + verbosity : {Optional[int]} + The degree of verbosity. Valid values are 0 (silent) - 3 (debug). objective : {SklObjective} Specify the learning task and the corresponding learning objective or a custom - objective function to be used. For custom objective, see - :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more - information. + objective function to be used. + + For custom objective, see :doc:`/tutorials/custom_metric_obj` and + :ref:`custom-obj-metric` for more information, along with the end note for + function signatures. + + booster: {Optional[str]} + + Specify which booster to use: ``gbtree``, ``gblinear`` or ``dart``. + + tree_method : {Optional[str]} - booster: Optional[str] - Specify which booster to use: `gbtree`, `gblinear` or `dart`. - tree_method: Optional[str] Specify which tree method to use. Default to auto. If this parameter is set to default, XGBoost will choose the most conservative option available. It's recommended to study this option from the parameters document :doc:`tree method ` - n_jobs : Optional[int] + + n_jobs : {Optional[int]} + Number of parallel threads used to run xgboost. When used with other Scikit-Learn algorithms like grid search, you may choose which algorithm to parallelize and balance the threads. Creating thread contention will significantly slow down both algorithms. - gamma : Optional[float] - (min_split_loss) Minimum loss reduction required to make a further partition on a - leaf node of the tree. - min_child_weight : Optional[float] + + gamma : {Optional[float]} + + (min_split_loss) Minimum loss reduction required to make a further partition on + a leaf node of the tree. + + min_child_weight : {Optional[float]} + Minimum sum of instance weight(hessian) needed in a child. - max_delta_step : Optional[float] + + max_delta_step : {Optional[float]} + Maximum delta step we allow each tree's weight estimation to be. - subsample : Optional[float] + + subsample : {Optional[float]} + Subsample ratio of the training instance. - sampling_method : + + sampling_method : {Optional[str]} + Sampling method. Used only by the GPU version of ``hist`` tree method. - - ``uniform``: select random training instances uniformly. - - ``gradient_based`` select random training instances with higher probability + + - ``uniform``: Select random training instances uniformly. + - ``gradient_based``: Select random training instances with higher probability when the gradient and hessian are larger. (cf. CatBoost) - colsample_bytree : Optional[float] + + colsample_bytree : {Optional[float]} + Subsample ratio of columns when constructing each tree. - colsample_bylevel : Optional[float] + + colsample_bylevel : {Optional[float]} + Subsample ratio of columns for each level. - colsample_bynode : Optional[float] + + colsample_bynode : {Optional[float]} + Subsample ratio of columns for each split. - reg_alpha : Optional[float] + + reg_alpha : {Optional[float]} + L1 regularization term on weights (xgb's alpha). - reg_lambda : Optional[float] + + reg_lambda : {Optional[float]} + L2 regularization term on weights (xgb's lambda). - scale_pos_weight : Optional[float] + + scale_pos_weight : {Optional[float]} Balancing of positive and negative weights. - base_score : Optional[float] + + base_score : {Optional[float]} + The initial prediction score of all instances, global bias. 
- random_state : Optional[Union[numpy.random.RandomState, numpy.random.Generator, int]] + + random_state : {Optional[Union[np.random.RandomState, np.random.Generator, int]]} + Random number seed. .. note:: @@ -248,34 +319,44 @@ __model_doc = f""" Using gblinear booster with shotgun updater is nondeterministic as it uses Hogwild algorithm. - missing : float, default np.nan - Value in the data which needs to be present as a missing value. - num_parallel_tree: Optional[int] + missing : float + + Value in the data which needs to be present as a missing value. Default to + :py:data:`numpy.nan`. + + num_parallel_tree: {Optional[int]} + Used for boosting random forest. - monotone_constraints : Optional[Union[Dict[str, int], str]] + + monotone_constraints : {Optional[Union[Dict[str, int], str]]} + Constraint of variable monotonicity. See :doc:`tutorial ` for more information. - interaction_constraints : Optional[Union[str, List[Tuple[str]]]] + + interaction_constraints : {Optional[Union[str, List[Tuple[str]]]]} + Constraints for interaction representing permitted interactions. The constraints must be specified in the form of a nested list, e.g. ``[[0, 1], [2, 3, 4]]``, where each inner list is a group of indices of features that are allowed to interact with each other. See :doc:`tutorial ` for more information - importance_type: Optional[str] + + importance_type: {Optional[str]} + The feature importance type for the feature_importances\\_ property: * For tree model, it's either "gain", "weight", "cover", "total_gain" or "total_cover". - * For linear model, only "weight" is defined and it's the normalized coefficients - without bias. + * For linear model, only "weight" is defined and it's the normalized + coefficients without bias. - device : Optional[str] + device : {Optional[str]} .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`. - validate_parameters : Optional[bool] + validate_parameters : {Optional[bool]} Give warnings for unknown parameter. @@ -283,14 +364,14 @@ __model_doc = f""" See the same parameter of :py:class:`DMatrix` for details. - feature_types : Optional[FeatureTypes] + feature_types : {Optional[FeatureTypes]} .. versionadded:: 1.7.0 Used for specifying feature types without constructing a dataframe. See :py:class:`DMatrix` for details. - max_cat_to_onehot : Optional[int] + max_cat_to_onehot : {Optional[int]} .. versionadded:: 1.6.0 @@ -303,7 +384,7 @@ __model_doc = f""" categorical feature support. See :doc:`Categorical Data ` and :ref:`cat-param` for details. - max_cat_threshold : Optional[int] + max_cat_threshold : {Optional[int]} .. versionadded:: 1.7.0 @@ -314,7 +395,7 @@ __model_doc = f""" needs to be set to have categorical feature support. See :doc:`Categorical Data ` and :ref:`cat-param` for details. - multi_strategy : Optional[str] + multi_strategy : {Optional[str]} .. versionadded:: 2.0.0 @@ -327,7 +408,7 @@ __model_doc = f""" - ``one_output_per_tree``: One model for each target. - ``multi_output_tree``: Use multi-target trees. - eval_metric : Optional[Union[str, List[str], Callable]] + eval_metric : {Optional[Union[str, List[str], Callable]]} .. versionadded:: 1.6.0 @@ -360,7 +441,7 @@ __model_doc = f""" ) reg.fit(X, y, eval_set=[(X, y)]) - early_stopping_rounds : Optional[int] + early_stopping_rounds : {Optional[int]} .. versionadded:: 1.6.0 @@ -383,7 +464,8 @@ __model_doc = f""" early stopping. If there's more than one metric in **eval_metric**, the last metric will be used for early stopping. 
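For illustration, a minimal sketch of the ``sample_weight``-aware custom objective form documented in the custom objective note below; the ``weighted_ls`` helper, the random data, and the chosen parameter values are only examples, not part of this change:

import numpy as np
import xgboost as xgb

def weighted_ls(y_true, y_pred, *, sample_weight=None):
    # Squared-error gradient and hessian, scaled by the weights when provided.
    grad = y_pred - y_true
    hess = np.ones_like(y_true)
    if sample_weight is not None:
        grad *= sample_weight
        hess *= sample_weight
    return grad, hess

rng = np.random.default_rng(0)
X = rng.normal(size=(256, 4))
y = 2.0 * X[:, 0] + rng.normal(scale=0.1, size=256)
w = np.full(y.shape, 0.5)

reg = xgb.XGBRegressor(objective=weighted_ls, n_estimators=8)
# The wrapper detects the ``sample_weight`` parameter in the signature and forwards the
# weights passed to ``fit`` into ``weighted_ls``.
reg.fit(X, y, sample_weight=w)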
- callbacks : Optional[List[TrainingCallback]] + callbacks : {Optional[List[TrainingCallback]]} + List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. @@ -402,7 +484,8 @@ __model_doc = f""" reg = xgboost.XGBRegressor(**params, callbacks=callbacks) reg.fit(X, y) - kwargs : dict, optional + kwargs : {Optional[Any]} + Keyword arguments for XGBoost Booster object. Full documentation of parameters can be found :doc:`here `. Attempting to set a parameter via the constructor args and \\*\\*kwargs @@ -419,13 +502,16 @@ __custom_obj_note = """ .. note:: Custom objective function A custom objective function can be provided for the ``objective`` - parameter. In this case, it should have the signature - ``objective(y_true, y_pred) -> grad, hess``: + parameter. In this case, it should have the signature ``objective(y_true, + y_pred) -> [grad, hess]`` or ``objective(y_true, y_pred, *, sample_weight) + -> [grad, hess]``: y_true: array_like of shape [n_samples] The target values y_pred: array_like of shape [n_samples] The predicted values + sample_weight : + Optional sample weights. grad: array_like of shape [n_samples] The value of the gradient for each sample point. diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py index eb226611d..2f24effe5 100644 --- a/python-package/xgboost/spark/core.py +++ b/python-package/xgboost/spark/core.py @@ -95,6 +95,7 @@ from .utils import ( deserialize_xgb_model, get_class_name, get_logger, + get_logger_level, serialize_booster, use_cuda, ) @@ -181,6 +182,8 @@ pred = Pred("prediction", "rawPrediction", "probability", "predContrib") _INIT_BOOSTER_SAVE_PATH = "init_booster.json" +_LOG_TAG = "XGBoost-PySpark" + class _SparkXGBParams( HasFeaturesCol, @@ -344,15 +347,14 @@ class _SparkXGBParams( predict_params[param.name] = self.getOrDefault(param) return predict_params - def _validate_gpu_params(self) -> None: + def _validate_gpu_params( + self, spark_version: str, conf: SparkConf, is_local: bool = False + ) -> None: """Validate the gpu parameters and gpu configurations""" if self._run_on_gpu(): - ss = _get_spark_session() - sc = ss.sparkContext - - if _is_local(sc): - # Support GPU training in Spark local mode is just for debugging + if is_local: + # Supporting GPU training in Spark local mode is just for debugging # purposes, so it's okay for printing the below warning instead of # checking the real gpu numbers and raising the exception. get_logger(self.__class__.__name__).warning( @@ -361,33 +363,41 @@ class _SparkXGBParams( self.getOrDefault(self.num_workers), ) else: - executor_gpus = sc.getConf().get("spark.executor.resource.gpu.amount") + executor_gpus = conf.get("spark.executor.resource.gpu.amount") if executor_gpus is None: raise ValueError( "The `spark.executor.resource.gpu.amount` is required for training" " on GPU." 
) - - if not ( - ss.version >= "3.4.0" - and _is_standalone_or_localcluster(sc.getConf()) + gpu_per_task = conf.get("spark.task.resource.gpu.amount") + if gpu_per_task is not None and float(gpu_per_task) > 1.0: + get_logger(self.__class__.__name__).warning( + "The configuration assigns %s GPUs to each Spark task, but each " + "XGBoost training task only utilizes 1 GPU, which will lead to " + "unnecessary GPU waste", + gpu_per_task, + ) + # For 3.5.1+, Spark supports task stage-level scheduling for + # YARN/K8s/Standalone/Local cluster + # From 3.4.0 ~ 3.5.0, Spark only supports task stage-level scheduling for + # Standalone/Local cluster + # For Spark below 3.4.0, task stage-level scheduling is not supported. + # + # With stage-level scheduling, spark.task.resource.gpu.amount is not required + # to be set explicitly. Otherwise, spark.task.resource.gpu.amount is required and + # must be set to 1.0. + if spark_version < "3.4.0" or ( + "3.4.0" <= spark_version < "3.5.1" + and not _is_standalone_or_localcluster(conf) ): - # We will enable stage-level scheduling in spark 3.4.0+ which doesn't - # require spark.task.resource.gpu.amount to be set explicitly - gpu_per_task = sc.getConf().get("spark.task.resource.gpu.amount") if gpu_per_task is not None: if float(gpu_per_task) < 1.0: raise ValueError( - "XGBoost doesn't support GPU fractional configurations. " - "Please set `spark.task.resource.gpu.amount=spark.executor" - ".resource.gpu.amount`" - ) - - if float(gpu_per_task) > 1.0: - get_logger(self.__class__.__name__).warning( - "%s GPUs for each Spark task is configured, but each " - "XGBoost training task uses only 1 GPU.", - gpu_per_task, + "XGBoost doesn't support GPU fractional configurations. Please set " + "`spark.task.resource.gpu.amount=spark.executor.resource.gpu." + "amount`. To enable GPU fractional configurations, you can try " + "standalone/localcluster with spark 3.4.0+ and " + "YARN/K8S with spark 3.5.1+" ) else: raise ValueError( @@ -472,7 +482,9 @@ class _SparkXGBParams( "`pyspark.ml.linalg.Vector` type."
) - self._validate_gpu_params() + ss = _get_spark_session() + sc = ss.sparkContext + self._validate_gpu_params(ss.version, sc.getConf(), _is_local(sc)) def _run_on_gpu(self) -> bool: """If train or transform on the gpu according to the parameters""" @@ -922,10 +934,14 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): ) return True - if not _is_standalone_or_localcluster(conf): + if ( + "3.4.0" <= spark_version < "3.5.1" + and not _is_standalone_or_localcluster(conf) + ): self.logger.info( - "Stage-level scheduling in xgboost requires spark standalone or " - "local-cluster mode" + "For %s, Stage-level scheduling in xgboost requires spark standalone " + "or local-cluster mode", + spark_version, ) return True @@ -977,7 +993,9 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): """Try to enable stage-level scheduling""" ss = _get_spark_session() conf = ss.sparkContext.getConf() - if self._skip_stage_level_scheduling(ss.version, conf): + if _is_local(ss.sparkContext) or self._skip_stage_level_scheduling( + ss.version, conf + ): return rdd # executor_cores will not be None @@ -1034,6 +1052,8 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): num_workers = self.getOrDefault(self.num_workers) + log_level = get_logger_level(_LOG_TAG) + def _train_booster( pandas_df_iter: Iterator[pd.DataFrame], ) -> Iterator[pd.DataFrame]: @@ -1047,7 +1067,8 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): dev_ordinal = None use_qdm = _can_use_qdm(booster_params.get("tree_method", None)) - + verbosity = booster_params.get("verbosity", 1) + msg = "Training on CPUs" if run_on_gpu: dev_ordinal = ( context.partitionId() if is_local else _get_gpu_id(context) @@ -1058,10 +1079,9 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): # Note: Checking `is_cudf_available` in spark worker side because # spark worker might has different python environment with driver side. 
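# (For instance, the driver may have cuDF installed while some executors do not, or the
# other way around, so the capability has to be probed inside the function that actually
# runs on the executor.)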
use_qdm = use_qdm and is_cudf_available() - get_logger("XGBoost-PySpark").info( - "Leveraging %s to train with QDM: %s", - booster_params["device"], - "on" if use_qdm else "off", + msg = ( + f"Leveraging {booster_params['device']} to train with " + f"QDM: {'on' if use_qdm else 'off'}" ) if use_qdm and (booster_params.get("max_bin", None) is not None): @@ -1070,6 +1090,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): _rabit_args = {} if context.partitionId() == 0: _rabit_args = _get_rabit_args(context, num_workers) + get_logger(_LOG_TAG, log_level).info(msg) worker_message = { "rabit_msg": _rabit_args, @@ -1084,15 +1105,16 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): evals_result: Dict[str, Any] = {} with CommunicatorContext(context, **_rabit_args): - dtrain, dvalid = create_dmatrix_from_partitions( - pandas_df_iter, - feature_prop.features_cols_names, - dev_ordinal, - use_qdm, - dmatrix_kwargs, - enable_sparse_data_optim=feature_prop.enable_sparse_data_optim, - has_validation_col=feature_prop.has_validation_col, - ) + with xgboost.config_context(verbosity=verbosity): + dtrain, dvalid = create_dmatrix_from_partitions( + pandas_df_iter, + feature_prop.features_cols_names, + dev_ordinal, + use_qdm, + dmatrix_kwargs, + enable_sparse_data_optim=feature_prop.enable_sparse_data_optim, + has_validation_col=feature_prop.has_validation_col, + ) if dvalid is not None: dval = [(dtrain, "training"), (dvalid, "validation")] else: @@ -1127,7 +1149,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): ret = rdd_with_resource.collect()[0] return ret[0], ret[1] - get_logger("XGBoost-PySpark").info( + get_logger(_LOG_TAG).info( "Running xgboost-%s on %s workers with" "\n\tbooster params: %s" "\n\ttrain_call_kwargs_params: %s" @@ -1139,7 +1161,7 @@ class _SparkXGBEstimator(Estimator, _SparkXGBParams, MLReadable, MLWritable): dmatrix_kwargs, ) (config, booster) = _run_job() - get_logger("XGBoost-PySpark").info("Finished xgboost training!") + get_logger(_LOG_TAG).info("Finished xgboost training!") result_xgb_model = self._convert_to_sklearn_model( bytearray(booster, "utf-8"), config @@ -1342,7 +1364,7 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable): # User don't set gpu configurations, just use cpu if gpu_per_task is None: if use_gpu_by_params: - get_logger("XGBoost-PySpark").warning( + get_logger(_LOG_TAG).warning( "Do the prediction on the CPUs since " "no gpu configurations are set" ) @@ -1377,6 +1399,8 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable): is_local = _is_local(_get_spark_session().sparkContext) run_on_gpu = self._run_on_gpu() + log_level = get_logger_level(_LOG_TAG) + @pandas_udf(schema) # type: ignore def predict_udf(iterator: Iterator[pd.DataFrame]) -> Iterator[pd.Series]: assert xgb_sklearn_model is not None @@ -1413,7 +1437,8 @@ class _SparkXGBModel(Model, _SparkXGBParams, MLReadable, MLWritable): else: msg = "CUDF or Cupy is unavailable, fallback the inference on the CPUs" - get_logger("XGBoost-PySpark").info(msg) + if context.partitionId() == 0: + get_logger(_LOG_TAG, log_level).info(msg) def to_gpu_if_possible(data: ArrayLike) -> ArrayLike: """Move the data to gpu if possible""" diff --git a/python-package/xgboost/spark/utils.py b/python-package/xgboost/spark/utils.py index 84333df53..7dbe290ae 100644 --- a/python-package/xgboost/spark/utils.py +++ b/python-package/xgboost/spark/utils.py @@ -8,13 +8,14 @@ import os import sys import uuid from 
threading import Thread -from typing import Any, Callable, Dict, Optional, Set, Type +from typing import Any, Callable, Dict, Optional, Set, Type, Union import pyspark from pyspark import BarrierTaskContext, SparkConf, SparkContext, SparkFiles, TaskContext from pyspark.sql.session import SparkSession -from xgboost import Booster, XGBModel, collective +from xgboost import Booster, XGBModel +from xgboost.collective import CommunicatorContext as CCtx from xgboost.tracker import RabitTracker @@ -42,22 +43,12 @@ def _get_default_params_from_func( return filtered_params_dict -class CommunicatorContext: - """A context controlling collective communicator initialization and finalization. - This isn't specificially necessary (note Part 3), but it is more understandable - coding-wise. - - """ +class CommunicatorContext(CCtx): # pylint: disable=too-few-public-methods + """Context with PySpark specific task ID.""" def __init__(self, context: BarrierTaskContext, **args: Any) -> None: - self.args = args - self.args["DMLC_TASK_ID"] = str(context.partitionId()) - - def __enter__(self) -> None: - collective.init(**self.args) - - def __exit__(self, *args: Any) -> None: - collective.finalize() + args["DMLC_TASK_ID"] = str(context.partitionId()) + super().__init__(**args) def _start_tracker(context: BarrierTaskContext, n_workers: int) -> Dict[str, Any]: @@ -98,10 +89,15 @@ def _get_spark_session() -> SparkSession: return SparkSession.builder.getOrCreate() -def get_logger(name: str, level: str = "INFO") -> logging.Logger: +def get_logger(name: str, level: Optional[Union[str, int]] = None) -> logging.Logger: """Gets a logger by name, or creates and configures it for the first time.""" logger = logging.getLogger(name) - logger.setLevel(level) + if level is not None: + logger.setLevel(level) + else: + # Default to info if not set. 
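# logging.NOTSET (0) means no level was ever configured for this logger; a level that
# was set explicitly elsewhere is left untouched, and get_logger_level() defined below
# returns None in the NOTSET case so callers can distinguish an explicit level from the
# default.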
+ if logger.level == logging.NOTSET: + logger.setLevel(logging.INFO) # If the logger is configured, skip the configure if not logger.handlers and not logging.getLogger().handlers: handler = logging.StreamHandler(sys.stderr) @@ -113,6 +109,12 @@ def get_logger(name: str, level: str = "INFO") -> logging.Logger: return logger +def get_logger_level(name: str) -> Optional[int]: + """Get the logger level for the given log name""" + logger = logging.getLogger(name) + return None if logger.level == logging.NOTSET else logger.level + + def _get_max_num_concurrent_tasks(spark_context: SparkContext) -> int: """Gets the current max number of concurrent tasks.""" # pylint: disable=protected-access diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 389066f0e..f7d9510fa 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -815,10 +815,15 @@ def softprob_obj( return objective -def ls_obj(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: +def ls_obj( + y_true: np.ndarray, y_pred: np.ndarray, sample_weight: Optional[np.ndarray] = None +) -> Tuple[np.ndarray, np.ndarray]: """Least squared error.""" grad = y_pred - y_true hess = np.ones(len(y_true)) + if sample_weight is not None: + grad *= sample_weight + hess *= sample_weight return grad, hess diff --git a/python-package/xgboost/testing/ranking.py b/python-package/xgboost/testing/ranking.py index a11eb3e03..72cf37aeb 100644 --- a/python-package/xgboost/testing/ranking.py +++ b/python-package/xgboost/testing/ranking.py @@ -100,3 +100,21 @@ def run_ranking_categorical(device: str) -> None: scores = cross_val_score(ltr, X, y) for s in scores: assert s > 0.7 + + +def run_normalization(device: str) -> None: + """Test normalization.""" + X, y, qid, _ = tm.make_ltr(2048, 4, 64, 3) + ltr = xgb.XGBRanker(objective="rank:pairwise", n_estimators=4, device=device) + ltr.fit(X, y, qid=qid, eval_set=[(X, y)], eval_qid=[qid]) + e0 = ltr.evals_result() + + ltr = xgb.XGBRanker( + objective="rank:pairwise", + n_estimators=4, + device=device, + lambdarank_normalization=False, + ) + ltr.fit(X, y, qid=qid, eval_set=[(X, y)], eval_qid=[qid]) + e1 = ltr.evals_result() + assert e1["validation_0"]["ndcg@32"][-1] > e0["validation_0"]["ndcg@32"][-1] diff --git a/rabit/include/rabit/internal/socket.h b/rabit/include/rabit/internal/socket.h index 89e324482..cec246efd 100644 --- a/rabit/include/rabit/internal/socket.h +++ b/rabit/include/rabit/internal/socket.h @@ -100,6 +100,24 @@ std::enable_if_t, xgboost::collective::Result> PollError(E if ((revents & POLLNVAL) != 0) { return xgboost::system::FailWithCode("Invalid polling request."); } + if ((revents & POLLHUP) != 0) { + // Excerpt from the Linux manual: + // + // Note that when reading from a channel such as a pipe or a stream socket, this event + // merely indicates that the peer closed its end of the channel.Subsequent reads from + // the channel will return 0 (end of file) only after all outstanding data in the + // channel has been consumed. + // + // We don't usually have a barrier for exiting workers, it's normal to have one end + // exit while the other still reading data. 
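// Treating POLLHUP as success therefore lets the reader drain whatever is still
// buffered; the closed connection later shows up as a zero-byte read (EOF) instead of a
// poll error.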
+ return xgboost::collective::Success(); + } +#if defined(POLLRDHUP) + // Linux only flag + if ((revents & POLLRDHUP) != 0) { + return xgboost::system::FailWithCode("Poll hung up on the other end."); + } +#endif // defined(POLLRDHUP) return xgboost::collective::Success(); } @@ -179,9 +197,11 @@ struct PollHelper { } std::int32_t ret = PollImpl(fdset.data(), fdset.size(), timeout); if (ret == 0) { - return xgboost::collective::Fail("Poll timeout.", std::make_error_code(std::errc::timed_out)); + return xgboost::collective::Fail( + "Poll timeout:" + std::to_string(timeout.count()) + " seconds.", + std::make_error_code(std::errc::timed_out)); } else if (ret < 0) { - return xgboost::system::FailWithCode("Poll failed."); + return xgboost::system::FailWithCode("Poll failed, nfds:" + std::to_string(fdset.size())); } for (auto& pfd : fdset) { diff --git a/rabit/src/allreduce_base.cc b/rabit/src/allreduce_base.cc index b99eb3763..fcf80b414 100644 --- a/rabit/src/allreduce_base.cc +++ b/rabit/src/allreduce_base.cc @@ -132,7 +132,7 @@ bool AllreduceBase::Shutdown() { try { for (auto &all_link : all_links) { if (!all_link.sock.IsClosed()) { - all_link.sock.Close(); + SafeColl(all_link.sock.Close()); } } all_links.clear(); @@ -146,7 +146,7 @@ bool AllreduceBase::Shutdown() { LOG(FATAL) << rc.Report(); } tracker.Send(xgboost::StringView{"shutdown"}); - tracker.Close(); + SafeColl(tracker.Close()); xgboost::system::SocketFinalize(); return true; } catch (std::exception const &e) { @@ -167,7 +167,7 @@ void AllreduceBase::TrackerPrint(const std::string &msg) { tracker.Send(xgboost::StringView{"print"}); tracker.Send(xgboost::StringView{msg}); - tracker.Close(); + SafeColl(tracker.Close()); } // util to parse data with unit suffix @@ -332,15 +332,15 @@ void AllreduceBase::SetParam(const char *name, const char *val) { auto sock_listen{xgboost::collective::TCPSocket::Create(tracker.Domain())}; // create listening socket - int port = sock_listen.BindHost(); - utils::Check(port != -1, "ReConnectLink fail to bind the ports specified"); - sock_listen.Listen(); + std::int32_t port{0}; + SafeColl(sock_listen.BindHost(&port)); + SafeColl(sock_listen.Listen()); // get number of to connect and number of to accept nodes from tracker int num_conn, num_accept, num_error = 1; do { for (auto & all_link : all_links) { - all_link.sock.Close(); + SafeColl(all_link.sock.Close()); } // tracker construct goodset Assert(tracker.RecvAll(&num_conn, sizeof(num_conn)) == sizeof(num_conn), @@ -352,7 +352,7 @@ void AllreduceBase::SetParam(const char *name, const char *val) { LinkRecord r; int hport, hrank; std::string hname; - tracker.Recv(&hname); + SafeColl(tracker.Recv(&hname)); Assert(tracker.RecvAll(&hport, sizeof(hport)) == sizeof(hport), "ReConnectLink failure 9"); Assert(tracker.RecvAll(&hrank, sizeof(hrank)) == sizeof(hrank), "ReConnectLink failure 10"); // connect to peer @@ -360,7 +360,7 @@ void AllreduceBase::SetParam(const char *name, const char *val) { timeout_sec, &r.sock) .OK()) { num_error += 1; - r.sock.Close(); + SafeColl(r.sock.Close()); continue; } Assert(r.sock.SendAll(&rank, sizeof(rank)) == sizeof(rank), @@ -386,7 +386,7 @@ void AllreduceBase::SetParam(const char *name, const char *val) { // send back socket listening port to tracker Assert(tracker.SendAll(&port, sizeof(port)) == sizeof(port), "ReConnectLink failure 14"); // close connection to tracker - tracker.Close(); + SafeColl(tracker.Close()); // listen to incoming links for (int i = 0; i < num_accept; ++i) { @@ -408,7 +408,7 @@ void 
AllreduceBase::SetParam(const char *name, const char *val) { } if (!match) all_links.emplace_back(std::move(r)); } - sock_listen.Close(); + SafeColl(sock_listen.Close()); this->parent_index = -1; // setup tree links and ring structure @@ -635,7 +635,7 @@ AllreduceBase::TryAllreduceTree(void *sendrecvbuf_, Recv(sendrecvbuf + size_down_in, total_size - size_down_in); if (len == 0) { - links[parent_index].sock.Close(); + SafeColl(links[parent_index].sock.Close()); return ReportError(&links[parent_index], kRecvZeroLen); } if (len != -1) { diff --git a/rabit/src/allreduce_base.h b/rabit/src/allreduce_base.h index 7724bf3d5..9991c2138 100644 --- a/rabit/src/allreduce_base.h +++ b/rabit/src/allreduce_base.h @@ -270,7 +270,7 @@ class AllreduceBase : public IEngine { ssize_t len = sock.Recv(buffer_head + offset, nmax); // length equals 0, remote disconnected if (len == 0) { - sock.Close(); return kRecvZeroLen; + SafeColl(sock.Close()); return kRecvZeroLen; } if (len == -1) return Errno2Return(); size_read += static_cast(len); @@ -289,7 +289,7 @@ class AllreduceBase : public IEngine { ssize_t len = sock.Recv(p + size_read, max_size - size_read); // length equals 0, remote disconnected if (len == 0) { - sock.Close(); return kRecvZeroLen; + SafeColl(sock.Close()); return kRecvZeroLen; } if (len == -1) return Errno2Return(); size_read += static_cast(len); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f674997af..297945ab9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -17,8 +17,9 @@ if(USE_CUDA) endif() if (USE_HIP) - file(GLOB_RECURSE HIP_SOURCES *.hip *.hip.h) + file(GLOB_RECURSE HIP_SOURCES *.cu *.hip.h) target_sources(objxgboost PRIVATE ${HIP_SOURCES}) + set_source_files_properties(${HIP_SOURCES} PROPERTIES LANGUAGE HIP) endif (USE_HIP) if(PLUGIN_SYCL) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index dfd663da3..df1d9c05f 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -1,5 +1,5 @@ /** - * Copyright 2014-2024 by XGBoost Contributors + * Copyright 2014-2024, XGBoost Contributors */ #include "xgboost/c_api.h" @@ -617,8 +617,8 @@ XGB_DLL int XGDMatrixSetFloatInfo(DMatrixHandle handle, const char *field, const API_BEGIN(); CHECK_HANDLE(); xgboost_CHECK_C_ARG_PTR(field); - auto const& p_fmat = *static_cast *>(handle); - p_fmat->SetInfo(field, info, xgboost::DataType::kFloat32, len); + auto const &p_fmat = *static_cast *>(handle); + p_fmat->SetInfo(field, linalg::Make1dInterface(info, len)); API_END(); } @@ -637,8 +637,9 @@ XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle, const char *field, const API_BEGIN(); CHECK_HANDLE(); xgboost_CHECK_C_ARG_PTR(field); + LOG(WARNING) << error::DeprecatedFunc(__func__, "2.1.0", "XGDMatrixSetInfoFromInterface"); auto const &p_fmat = *static_cast *>(handle); - p_fmat->SetInfo(field, info, xgboost::DataType::kUInt32, len); + p_fmat->SetInfo(field, linalg::Make1dInterface(info, len)); API_END(); } @@ -682,19 +683,52 @@ XGB_DLL int XGDMatrixSetDenseInfo(DMatrixHandle handle, const char *field, void xgboost::bst_ulong size, int type) { API_BEGIN(); CHECK_HANDLE(); + LOG(WARNING) << error::DeprecatedFunc(__func__, "2.1.0", "XGDMatrixSetInfoFromInterface"); auto const &p_fmat = *static_cast *>(handle); CHECK(type >= 1 && type <= 4); xgboost_CHECK_C_ARG_PTR(field); - p_fmat->SetInfo(field, data, static_cast(type), size); - API_END(); -} -XGB_DLL int XGDMatrixSetGroup(DMatrixHandle handle, const unsigned *group, xgboost::bst_ulong len) { - API_BEGIN(); - CHECK_HANDLE(); - LOG(WARNING) << "XGDMatrixSetGroup is deprecated, use 
`XGDMatrixSetUIntInfo` instead."; - auto const &p_fmat = *static_cast *>(handle); - p_fmat->SetInfo("group", group, xgboost::DataType::kUInt32, len); + Context ctx; + auto dtype = static_cast(type); + std::string str; + auto proc = [&](auto cast_d_ptr) { + using T = std::remove_pointer_t; + auto t = linalg::TensorView( + common::Span{cast_d_ptr, static_cast::index_type>(size)}, + {size}, DeviceOrd::CPU()); + CHECK(t.CContiguous()); + Json iface{linalg::ArrayInterface(t)}; + CHECK(ArrayInterface<1>{iface}.is_contiguous); + str = Json::Dump(iface); + return str; + }; + + // Legacy code using XGBoost dtype, which is a small subset of array interface types. + switch (dtype) { + case xgboost::DataType::kFloat32: { + auto cast_ptr = reinterpret_cast(data); + p_fmat->Info().SetInfo(ctx, field, proc(cast_ptr)); + break; + } + case xgboost::DataType::kDouble: { + auto cast_ptr = reinterpret_cast(data); + p_fmat->Info().SetInfo(ctx, field, proc(cast_ptr)); + break; + } + case xgboost::DataType::kUInt32: { + auto cast_ptr = reinterpret_cast(data); + p_fmat->Info().SetInfo(ctx, field, proc(cast_ptr)); + break; + } + case xgboost::DataType::kUInt64: { + auto cast_ptr = reinterpret_cast(data); + p_fmat->Info().SetInfo(ctx, field, proc(cast_ptr)); + break; + } + default: + LOG(FATAL) << "Unknown data type" << static_cast(dtype); + } + API_END(); } @@ -990,7 +1024,7 @@ XGB_DLL int XGBoosterBoostOneIter(BoosterHandle handle, DMatrixHandle dtrain, bs bst_float *hess, xgboost::bst_ulong len) { API_BEGIN(); CHECK_HANDLE(); - error::DeprecatedFunc(__func__, "2.1.0", "XGBoosterTrainOneIter"); + LOG(WARNING) << error::DeprecatedFunc(__func__, "2.1.0", "XGBoosterTrainOneIter"); auto *learner = static_cast(handle); auto ctx = learner->Ctx()->MakeCPU(); diff --git a/src/c_api/c_api.hip b/src/c_api/c_api.hip deleted file mode 100644 index 715845ea3..000000000 --- a/src/c_api/c_api.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "c_api.cu" -#endif diff --git a/src/c_api/c_api_utils.h b/src/c_api/c_api_utils.h index 5526619c0..7e38b8aa2 100644 --- a/src/c_api/c_api_utils.h +++ b/src/c_api/c_api_utils.h @@ -1,17 +1,18 @@ /** - * Copyright 2021-2023, XGBoost Contributors + * Copyright 2021-2024, XGBoost Contributors */ #ifndef XGBOOST_C_API_C_API_UTILS_H_ #define XGBOOST_C_API_C_API_UTILS_H_ -#include -#include -#include -#include // for shared_ptr -#include // for string -#include // for make_tuple -#include // for move -#include +#include // for min +#include // for size_t +#include // for multiplies +#include // for shared_ptr +#include // for accumulate +#include // for string +#include // for make_tuple +#include // for move +#include // for vector #include "../common/json_utils.h" // for TypeCheck #include "xgboost/c_api.h" diff --git a/src/c_api/coll_c_api.cc b/src/c_api/coll_c_api.cc index 01713dbad..fba2647cc 100644 --- a/src/c_api/coll_c_api.cc +++ b/src/c_api/coll_c_api.cc @@ -1,15 +1,17 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include // for seconds -#include // for size_t #include // for future #include // for unique_ptr #include // for string +#include // for sleep_for #include // for is_same_v, remove_pointer_t #include // for pair +#include "../collective/comm.h" // for DefaultTimeoutSec #include "../collective/tracker.h" // for RabitTracker +#include "../common/timer.h" // for Timer #include "c_api_error.h" // for API_BEGIN #include "xgboost/c_api.h" #include "xgboost/collective/result.h" // for Result @@ -26,7 +28,7 @@ 
using namespace xgboost; // NOLINT namespace { using TrackerHandleT = - std::pair, std::shared_future>; + std::pair, std::shared_future>; TrackerHandleT *GetTrackerHandle(TrackerHandle handle) { xgboost_CHECK_C_ARG_PTR(handle); @@ -40,17 +42,29 @@ struct CollAPIEntry { }; using CollAPIThreadLocalStore = dmlc::ThreadLocalStore; -void WaitImpl(TrackerHandleT *ptr) { - std::chrono::seconds wait_for{100}; +void WaitImpl(TrackerHandleT *ptr, std::chrono::seconds timeout) { + constexpr std::int64_t kDft{collective::DefaultTimeoutSec()}; + std::chrono::seconds wait_for{timeout.count() != 0 ? std::min(kDft, timeout.count()) : kDft}; + + common::Timer timer; + timer.Start(); + + auto ref = ptr->first; // hold a reference to that free don't delete it while waiting. + auto fut = ptr->second; while (fut.valid()) { auto res = fut.wait_for(wait_for); CHECK(res != std::future_status::deferred); + if (res == std::future_status::ready) { auto const &rc = ptr->second.get(); - CHECK(rc.OK()) << rc.Report(); + collective::SafeColl(rc); break; } + + if (timer.Duration() > timeout && timeout.count() != 0) { + collective::SafeColl(collective::Fail("Timeout waiting for the tracker.")); + } } } } // namespace @@ -62,15 +76,15 @@ XGB_DLL int XGTrackerCreate(char const *config, TrackerHandle *handle) { Json jconfig = Json::Load(config); auto type = RequiredArg(jconfig, "dmlc_communicator", __func__); - std::unique_ptr tptr; + std::shared_ptr tptr; if (type == "federated") { #if defined(XGBOOST_USE_FEDERATED) - tptr = std::make_unique(jconfig); + tptr = std::make_shared(jconfig); #else LOG(FATAL) << error::NoFederated(); #endif // defined(XGBOOST_USE_FEDERATED) } else if (type == "rabit") { - tptr = std::make_unique(jconfig); + tptr = std::make_shared(jconfig); } else { LOG(FATAL) << "Unknown communicator:" << type; } @@ -93,7 +107,7 @@ XGB_DLL int XGTrackerWorkerArgs(TrackerHandle handle, char const **args) { API_END(); } -XGB_DLL int XGTrackerRun(TrackerHandle handle) { +XGB_DLL int XGTrackerRun(TrackerHandle handle, char const *) { API_BEGIN(); auto *ptr = GetTrackerHandle(handle); CHECK(!ptr->second.valid()) << "Tracker is already running."; @@ -101,19 +115,39 @@ XGB_DLL int XGTrackerRun(TrackerHandle handle) { API_END(); } -XGB_DLL int XGTrackerWait(TrackerHandle handle, char const *config) { +XGB_DLL int XGTrackerWaitFor(TrackerHandle handle, char const *config) { API_BEGIN(); auto *ptr = GetTrackerHandle(handle); xgboost_CHECK_C_ARG_PTR(config); auto jconfig = Json::Load(StringView{config}); - WaitImpl(ptr); + // Internally, 0 indicates no timeout, which is the default since we don't want to + // interrupt the model training. + xgboost_CHECK_C_ARG_PTR(config); + auto timeout = OptionalArg(jconfig, "timeout", std::int64_t{0}); + WaitImpl(ptr, std::chrono::seconds{timeout}); API_END(); } XGB_DLL int XGTrackerFree(TrackerHandle handle) { API_BEGIN(); + using namespace std::chrono_literals; // NOLINT auto *ptr = GetTrackerHandle(handle); - WaitImpl(ptr); + ptr->first->Stop(); + // The wait is not necessary since we just called stop, just reusing the function to do + // any potential cleanups. + WaitImpl(ptr, ptr->first->Timeout()); + common::Timer timer; + timer.Start(); + // Make sure no one else is waiting on the tracker. 
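// ptr->first is a shared_ptr and WaitImpl() keeps its own copy alive while it waits, so
// unique() only becomes true once every outstanding waiter has returned; the loop below
// polls for that, giving up after the tracker timeout elapses.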
+ while (!ptr->first.unique()) { + auto ela = timer.Duration().count(); + if (ela > ptr->first->Timeout().count()) { + LOG(WARNING) << "Time out " << ptr->first->Timeout().count() + << " seconds reached for TrackerFree, killing the tracker."; + break; + } + std::this_thread::sleep_for(64ms); + } delete ptr; API_END(); } diff --git a/src/collective/aggregator.h b/src/collective/aggregator.h index f2a9ff528..bc652f2e8 100644 --- a/src/collective/aggregator.h +++ b/src/collective/aggregator.h @@ -1,22 +1,21 @@ /** - * Copyright 2023 by XGBoost contributors + * Copyright 2023-2024, XGBoost contributors * * Higher level functions built on top the Communicator API, taking care of behavioral differences * between row-split vs column-split distributed training, and horizontal vs vertical federated * learning. */ #pragma once -#include - #include #include #include #include #include "communicator-inl.h" +#include "xgboost/collective/result.h" // for Result +#include "xgboost/data.h" // for MetaINfo -namespace xgboost { -namespace collective { +namespace xgboost::collective { /** * @brief Apply the given function where the labels are. @@ -31,15 +30,16 @@ namespace collective { * @param size The size of the buffer. * @param function The function used to calculate the results. */ -template -void ApplyWithLabels(MetaInfo const& info, void* buffer, size_t size, Function&& function) { +template +void ApplyWithLabels(Context const*, MetaInfo const& info, void* buffer, std::size_t size, + FN&& function) { if (info.IsVerticalFederated()) { // We assume labels are only available on worker 0, so the calculation is done there and result // broadcast to other workers. std::string message; if (collective::GetRank() == 0) { try { - std::forward(function)(); + std::forward(function)(); } catch (dmlc::Error& e) { message = e.what(); } @@ -52,7 +52,7 @@ void ApplyWithLabels(MetaInfo const& info, void* buffer, size_t size, Function&& LOG(FATAL) << &message[0]; } } else { - std::forward(function)(); + std::forward(function)(); } } @@ -70,7 +70,8 @@ void ApplyWithLabels(MetaInfo const& info, void* buffer, size_t size, Function&& * @param function The function used to calculate the results. */ template -void ApplyWithLabels(MetaInfo const& info, HostDeviceVector* result, Function&& function) { +void ApplyWithLabels(Context const*, MetaInfo const& info, HostDeviceVector* result, + Function&& function) { if (info.IsVerticalFederated()) { // We assume labels are only available on worker 0, so the calculation is done there and result // broadcast to other workers. @@ -114,7 +115,9 @@ void ApplyWithLabels(MetaInfo const& info, HostDeviceVector* result, Function * @return The global max of the input. */ template -T GlobalMax(MetaInfo const& info, T value) { +std::enable_if_t, T> GlobalMax(Context const*, + MetaInfo const& info, + T value) { if (info.IsRowSplit()) { collective::Allreduce(&value, 1); } @@ -132,16 +135,18 @@ T GlobalMax(MetaInfo const& info, T value) { * @param values Pointer to the inputs to sum. * @param size Number of values to sum. 
*/ -template -void GlobalSum(MetaInfo const& info, T* values, size_t size) { +template +[[nodiscard]] Result GlobalSum(Context const*, MetaInfo const& info, + linalg::TensorView values) { if (info.IsRowSplit()) { - collective::Allreduce(values, size); + collective::Allreduce(values.Values().data(), values.Size()); } + return Success(); } template -void GlobalSum(MetaInfo const& info, Container* values) { - GlobalSum(info, values->data(), values->size()); +[[nodiscard]] Result GlobalSum(Context const* ctx, MetaInfo const& info, Container* values) { + return GlobalSum(ctx, info, values->data(), values->size()); } /** @@ -157,9 +162,10 @@ void GlobalSum(MetaInfo const& info, Container* values) { * @return The global ratio of the two inputs. */ template -T GlobalRatio(MetaInfo const& info, T dividend, T divisor) { +T GlobalRatio(Context const* ctx, MetaInfo const& info, T dividend, T divisor) { std::array results{dividend, divisor}; - GlobalSum(info, &results); + auto rc = GlobalSum(ctx, info, linalg::MakeVec(results.data(), results.size())); + SafeColl(rc); std::tie(dividend, divisor) = std::tuple_cat(results); if (divisor <= 0) { return std::numeric_limits::quiet_NaN(); @@ -167,6 +173,4 @@ T GlobalRatio(MetaInfo const& info, T dividend, T divisor) { return dividend / divisor; } } - -} // namespace collective -} // namespace xgboost +} // namespace xgboost::collective diff --git a/src/collective/allgather.cc b/src/collective/allgather.cc index 148cb6cd2..5d1ec664e 100644 --- a/src/collective/allgather.cc +++ b/src/collective/allgather.cc @@ -1,5 +1,5 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include "allgather.h" @@ -7,6 +7,7 @@ #include // for size_t #include // for int8_t, int32_t, int64_t #include // for shared_ptr +#include // for move #include "broadcast.h" #include "comm.h" // for Comm, Channel @@ -29,16 +30,22 @@ Result RingAllgather(Comm const& comm, common::Span data, std::size auto rc = Success() << [&] { auto send_rank = (rank + world - r + worker_off) % world; auto send_off = send_rank * segment_size; - send_off = std::min(send_off, data.size_bytes()); - auto send_seg = data.subspan(send_off, std::min(segment_size, data.size_bytes() - send_off)); + bool is_last_segment = send_rank == (world - 1); + auto send_nbytes = is_last_segment ? (data.size_bytes() - send_off) : segment_size; + auto send_seg = data.subspan(send_off, send_nbytes); + CHECK_NE(send_seg.size(), 0); return next_ch->SendAll(send_seg.data(), send_seg.size_bytes()); } << [&] { auto recv_rank = (rank + world - r - 1 + worker_off) % world; auto recv_off = recv_rank * segment_size; - recv_off = std::min(recv_off, data.size_bytes()); - auto recv_seg = data.subspan(recv_off, std::min(segment_size, data.size_bytes() - recv_off)); + bool is_last_segment = recv_rank == (world - 1); + auto recv_nbytes = is_last_segment ? 
(data.size_bytes() - recv_off) : segment_size; + auto recv_seg = data.subspan(recv_off, recv_nbytes); + CHECK_NE(recv_seg.size(), 0); return prev_ch->RecvAll(recv_seg.data(), recv_seg.size_bytes()); - } << [&] { return prev_ch->Block(); }; + } << [&] { + return comm.Block(); + }; if (!rc.OK()) { return rc; } @@ -91,7 +98,9 @@ namespace detail { auto recv_size = sizes[recv_rank]; auto recv_seg = erased_result.subspan(recv_off, recv_size); return prev_ch->RecvAll(recv_seg.data(), recv_seg.size_bytes()); - } << [&] { return prev_ch->Block(); }; + } << [&] { + return prev_ch->Block(); + }; if (!rc.OK()) { return rc; } @@ -99,4 +108,47 @@ namespace detail { return comm.Block(); } } // namespace detail + +[[nodiscard]] std::vector> VectorAllgatherV( + Context const* ctx, CommGroup const& comm, std::vector> const& input) { + auto n_inputs = input.size(); + std::vector sizes(n_inputs); + std::transform(input.cbegin(), input.cend(), sizes.begin(), + [](auto const& vec) { return vec.size(); }); + + std::vector recv_segments(comm.World() + 1, 0); + + HostDeviceVector recv; + auto rc = + AllgatherV(ctx, comm, linalg::MakeVec(sizes.data(), sizes.size()), &recv_segments, &recv); + SafeColl(rc); + + auto global_sizes = common::RestoreType(recv.ConstHostSpan()); + std::vector offset(global_sizes.size() + 1); + offset[0] = 0; + for (std::size_t i = 1; i < offset.size(); i++) { + offset[i] = offset[i - 1] + global_sizes[i - 1]; + } + + std::vector collected; + for (auto const& vec : input) { + collected.insert(collected.end(), vec.cbegin(), vec.cend()); + } + rc = AllgatherV(ctx, comm, linalg::MakeVec(collected.data(), collected.size()), &recv_segments, + &recv); + SafeColl(rc); + auto out = common::RestoreType(recv.ConstHostSpan()); + + std::vector> result; + for (std::size_t i = 1; i < offset.size(); ++i) { + std::vector local(out.cbegin() + offset[i - 1], out.cbegin() + offset[i]); + result.emplace_back(std::move(local)); + } + return result; +} + +[[nodiscard]] std::vector> VectorAllgatherV( + Context const* ctx, std::vector> const& input) { + return VectorAllgatherV(ctx, *GlobalCommGroup(), input); +} } // namespace xgboost::collective diff --git a/src/collective/allgather.h b/src/collective/allgather.h index 4f13014be..ca44c3916 100644 --- a/src/collective/allgather.h +++ b/src/collective/allgather.h @@ -1,25 +1,27 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #pragma once #include // for size_t #include // for int32_t #include // for shared_ptr #include // for accumulate +#include // for string #include // for remove_cv_t #include // for vector -#include "../common/type.h" // for EraseType +#include "../common/type.h" // for EraseType #include "comm.h" // for Comm, Channel +#include "comm_group.h" // for CommGroup #include "xgboost/collective/result.h" // for Result -#include "xgboost/linalg.h" -#include "xgboost/span.h" // for Span +#include "xgboost/linalg.h" // for MakeVec +#include "xgboost/span.h" // for Span namespace xgboost::collective { namespace cpu_impl { /** * @param worker_off Segment offset. For example, if the rank 2 worker specifies - * worker_off = 1, then it owns the third segment. + * worker_off = 1, then it owns the third segment (2 + 1). 
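The RingAllgather change above drops the old offset clamping in favour of an explicit rule: the segment size is the byte count divided by the world size (rounded down), and the last rank's segment absorbs the remainder, so no send or receive segment can be empty. A small self-contained sketch of that arithmetic, with illustrative names:

#include <cstddef>
#include <iostream>
#include <utility>

// Segment [offset, offset + nbytes) owned by `rank` when `total` bytes are
// split across `world` workers with a round-down segment size.
std::pair<std::size_t, std::size_t> Segment(std::size_t total, int world, int rank) {
  std::size_t seg = total / world;                // round-down, may leave a remainder
  std::size_t off = seg * rank;
  bool last = (rank == world - 1);
  std::size_t nbytes = last ? total - off : seg;  // last rank absorbs the remainder
  return {off, nbytes};
}

int main() {
  for (int r = 0; r < 3; ++r) {
    auto [off, n] = Segment(10, 3, r);
    std::cout << "rank " << r << ": off=" << off << " nbytes=" << n << "\n";
  }
}

With 10 bytes across 3 workers the segments come out as 3, 3 and 4 bytes, matching the is_last_segment branches above.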
*/ [[nodiscard]] Result RingAllgather(Comm const& comm, common::Span data, std::size_t segment_size, std::int32_t worker_off, @@ -51,8 +53,10 @@ inline void AllgatherVOffset(common::Span sizes, } // namespace detail template -[[nodiscard]] Result RingAllgather(Comm const& comm, common::Span data, std::size_t size) { - auto n_bytes = sizeof(T) * size; +[[nodiscard]] Result RingAllgather(Comm const& comm, common::Span data) { + // This function is also used for ring allreduce, hence we allow the last segment to be + // larger due to round-down. + auto n_bytes_per_segment = data.size_bytes() / comm.World(); auto erased = common::EraseType(data); auto rank = comm.Rank(); @@ -61,7 +65,7 @@ template auto prev_ch = comm.Chan(prev); auto next_ch = comm.Chan(next); - auto rc = cpu_impl::RingAllgather(comm, erased, n_bytes, 0, prev_ch, next_ch); + auto rc = cpu_impl::RingAllgather(comm, erased, n_bytes_per_segment, 0, prev_ch, next_ch); if (!rc.OK()) { return rc; } @@ -76,7 +80,7 @@ template std::vector sizes(world, 0); sizes[rank] = data.size_bytes(); - auto rc = RingAllgather(comm, common::Span{sizes.data(), sizes.size()}, 1); + auto rc = RingAllgather(comm, common::Span{sizes.data(), sizes.size()}); if (!rc.OK()) { return rc; } @@ -98,4 +102,115 @@ template return detail::RingAllgatherV(comm, sizes, s_segments, erased_result); } + +template +[[nodiscard]] Result Allgather(Context const* ctx, CommGroup const& comm, + linalg::VectorView data) { + if (!comm.IsDistributed()) { + return Success(); + } + CHECK(data.Contiguous()); + auto erased = common::EraseType(data.Values()); + + auto const& cctx = comm.Ctx(ctx, data.Device()); + auto backend = comm.Backend(data.Device()); + return backend->Allgather(cctx, erased); +} + +/** + * @brief Gather all data from all workers. + * + * @param data The input and output buffer, needs to be pre-allocated by the caller. + */ +template +[[nodiscard]] Result Allgather(Context const* ctx, linalg::VectorView data) { + auto const& cg = *GlobalCommGroup(); + if (data.Size() % cg.World() != 0) { + return Fail("The total number of elements should be multiple of the number of workers."); + } + return Allgather(ctx, cg, data); +} + +template +[[nodiscard]] Result AllgatherV(Context const* ctx, CommGroup const& comm, + linalg::VectorView data, + std::vector* recv_segments, + HostDeviceVector* recv) { + if (!comm.IsDistributed()) { + return Success(); + } + std::vector sizes(comm.World(), 0); + sizes[comm.Rank()] = data.Values().size_bytes(); + auto erased_sizes = common::EraseType(common::Span{sizes.data(), sizes.size()}); + auto rc = comm.Backend(DeviceOrd::CPU()) + ->Allgather(comm.Ctx(ctx, DeviceOrd::CPU()), erased_sizes); + if (!rc.OK()) { + return rc; + } + + recv_segments->resize(sizes.size() + 1); + detail::AllgatherVOffset(sizes, common::Span{recv_segments->data(), recv_segments->size()}); + auto total_bytes = std::accumulate(sizes.cbegin(), sizes.cend(), 0LL); + recv->SetDevice(data.Device()); + recv->Resize(total_bytes); + + auto s_segments = common::Span{recv_segments->data(), recv_segments->size()}; + + auto backend = comm.Backend(data.Device()); + auto erased = common::EraseType(data.Values()); + + return backend->AllgatherV( + comm.Ctx(ctx, data.Device()), erased, common::Span{sizes.data(), sizes.size()}, s_segments, + data.Device().IsCUDA() ? recv->DeviceSpan() : recv->HostSpan(), AllgatherVAlgo::kBcast); +} + +/** + * @brief Allgather with variable length data. + * + * @param data The input data. + * @param recv_segments segment size for each worker. 
[0, 2, 5] means [0, 2) elements are + * from the first worker, [2, 5) elements are from the second one. + * @param recv The buffer storing the result. + */ +template +[[nodiscard]] Result AllgatherV(Context const* ctx, linalg::VectorView data, + std::vector* recv_segments, + HostDeviceVector* recv) { + return AllgatherV(ctx, *GlobalCommGroup(), data, recv_segments, recv); +} + +[[nodiscard]] std::vector> VectorAllgatherV( + Context const* ctx, CommGroup const& comm, std::vector> const& input); + +/** + * @brief Gathers variable-length data from all processes and distributes it to all processes. + * + * @param inputs All the inputs from the local worker. The number of inputs can vary + * across different workers. Along with which, the size of each vector in + * the input can also vary. + * + * @return The AllgatherV result, containing vectors from all workers. + */ +[[nodiscard]] std::vector> VectorAllgatherV( + Context const* ctx, std::vector> const& input); + +/** + * @brief Gathers variable-length strings from all processes and distributes them to all processes. + * @param input Variable-length list of variable-length strings. + */ +[[nodiscard]] inline Result AllgatherStrings(std::vector const& input, + std::vector* p_result) { + std::vector> inputs(input.size()); + for (std::size_t i = 0; i < input.size(); ++i) { + inputs[i] = {input[i].cbegin(), input[i].cend()}; + } + Context ctx; + auto out = VectorAllgatherV(&ctx, *GlobalCommGroup(), inputs); + auto& result = *p_result; + result.resize(out.size()); + for (std::size_t i = 0; i < out.size(); ++i) { + result[i] = {out[i].cbegin(), out[i].cend()}; + } + return Success(); +} } // namespace xgboost::collective diff --git a/src/collective/allreduce.cc b/src/collective/allreduce.cc index 93b76355f..55c5c8854 100644 --- a/src/collective/allreduce.cc +++ b/src/collective/allreduce.cc @@ -1,5 +1,5 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include "allreduce.h" @@ -16,7 +16,44 @@ #include "xgboost/span.h" // for Span namespace xgboost::collective::cpu_impl { +namespace { template +Result RingAllreduceSmall(Comm const& comm, common::Span data, Func const& op) { + auto rank = comm.Rank(); + auto world = comm.World(); + + auto next_ch = comm.Chan(BootstrapNext(rank, world)); + auto prev_ch = comm.Chan(BootstrapPrev(rank, world)); + + std::vector buffer(data.size_bytes() * world, 0); + auto s_buffer = common::Span{buffer.data(), buffer.size()}; + + auto offset = data.size_bytes() * rank; + auto self = s_buffer.subspan(offset, data.size_bytes()); + std::copy_n(data.data(), data.size_bytes(), self.data()); + + auto typed = common::RestoreType(s_buffer); + auto rc = RingAllgather(comm, typed); + + if (!rc.OK()) { + return rc; + } + auto first = s_buffer.subspan(0, data.size_bytes()); + CHECK_EQ(first.size(), data.size()); + + for (std::int32_t r = 1; r < world; ++r) { + auto offset = data.size_bytes() * r; + auto buf = s_buffer.subspan(offset, data.size_bytes()); + op(buf, first); + } + std::copy_n(first.data(), first.size(), data.data()); + + return Success(); +} +} // namespace + +template +// note that n_bytes_in_seg is calculated with round-down. 
Result RingScatterReduceTyped(Comm const& comm, common::Span data, std::size_t n_bytes_in_seg, Func const& op) { auto rank = comm.Rank(); @@ -27,33 +64,39 @@ Result RingScatterReduceTyped(Comm const& comm, common::Span data, auto next_ch = comm.Chan(dst_rank); auto prev_ch = comm.Chan(src_rank); - std::vector buffer(n_bytes_in_seg, 0); + std::vector buffer(data.size_bytes() - (world - 1) * n_bytes_in_seg, 0); auto s_buf = common::Span{buffer.data(), buffer.size()}; for (std::int32_t r = 0; r < world - 1; ++r) { - // send to ring next - auto send_off = ((rank + world - r) % world) * n_bytes_in_seg; - send_off = std::min(send_off, data.size_bytes()); - auto seg_nbytes = std::min(data.size_bytes() - send_off, n_bytes_in_seg); - auto send_seg = data.subspan(send_off, seg_nbytes); + common::Span seg, recv_seg; + auto rc = Success() << [&] { + // send to ring next + auto send_rank = (rank + world - r) % world; + auto send_off = send_rank * n_bytes_in_seg; - auto rc = next_ch->SendAll(send_seg); - if (!rc.OK()) { - return rc; - } + bool is_last_segment = send_rank == (world - 1); - // receive from ring prev - auto recv_off = ((rank + world - r - 1) % world) * n_bytes_in_seg; - recv_off = std::min(recv_off, data.size_bytes()); - seg_nbytes = std::min(data.size_bytes() - recv_off, n_bytes_in_seg); - CHECK_EQ(seg_nbytes % sizeof(T), 0); - auto recv_seg = data.subspan(recv_off, seg_nbytes); - auto seg = s_buf.subspan(0, recv_seg.size()); + auto seg_nbytes = is_last_segment ? data.size_bytes() - send_off : n_bytes_in_seg; + CHECK_EQ(seg_nbytes % sizeof(T), 0); - rc = std::move(rc) << [&] { return prev_ch->RecvAll(seg); } << [&] { return comm.Block(); }; - if (!rc.OK()) { - return rc; - } + auto send_seg = data.subspan(send_off, seg_nbytes); + return next_ch->SendAll(send_seg); + } << [&] { + // receive from ring prev + auto recv_rank = (rank + world - r - 1) % world; + auto recv_off = recv_rank * n_bytes_in_seg; + + bool is_last_segment = recv_rank == (world - 1); + + auto seg_nbytes = is_last_segment ? (data.size_bytes() - recv_off) : n_bytes_in_seg; + CHECK_EQ(seg_nbytes % sizeof(T), 0); + + recv_seg = data.subspan(recv_off, seg_nbytes); + seg = s_buf.subspan(0, recv_seg.size()); + return prev_ch->RecvAll(seg); + } << [&] { + return comm.Block(); + }; // accumulate to recv_seg CHECK_EQ(seg.size(), recv_seg.size()); @@ -68,6 +111,9 @@ Result RingAllreduce(Comm const& comm, common::Span data, Func cons if (comm.World() == 1) { return Success(); } + if (data.size_bytes() == 0) { + return Success(); + } return DispatchDType(type, [&](auto t) { using T = decltype(t); // Divide the data into segments according to the number of workers. 
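RingAllreduceSmall above takes a different route for tiny inputs (the dispatch just below falls back to it when there are fewer elements than workers, where a round-down segment size would be zero): every worker gathers the full buffers of all ranks, reduces the world copies locally, and copies the result back. A stand-alone sketch of that local reduction over an already-gathered layout:

#include <cstddef>
#include <functional>
#include <vector>

// Small-input allreduce fallback, sketched for a single process: `gathered`
// plays the role of the allgather result (world copies laid out back to back).
template <typename T, typename Op>
void ReduceGathered(std::vector<T> const& gathered, std::size_t n, Op op,
                    std::vector<T>* out) {
  std::size_t world = gathered.size() / n;
  out->assign(gathered.begin(), gathered.begin() + n);  // start from rank 0's copy
  for (std::size_t r = 1; r < world; ++r) {
    for (std::size_t i = 0; i < n; ++i) {
      (*out)[i] = op((*out)[i], gathered[r * n + i]);   // accumulate the r-th copy
    }
  }
}

int main() {
  // Two "workers" each contributed {1, 2}; the sum-allreduce result is {2, 4}.
  std::vector<int> gathered{1, 2, 1, 2};
  std::vector<int> out;
  ReduceGathered(gathered, 2, std::plus<>{}, &out);
}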
@@ -75,7 +121,11 @@ Result RingAllreduce(Comm const& comm, common::Span data, Func cons CHECK_EQ(data.size_bytes() % n_bytes_elem, 0); auto n = data.size_bytes() / n_bytes_elem; auto world = comm.World(); - auto n_bytes_in_seg = common::DivRoundUp(n, world) * sizeof(T); + if (n < static_cast(world)) { + return RingAllreduceSmall(comm, data, op); + } + + auto n_bytes_in_seg = (n / world) * sizeof(T); auto rc = RingScatterReduceTyped(comm, data, n_bytes_in_seg, op); if (!rc.OK()) { return rc; @@ -88,7 +138,9 @@ Result RingAllreduce(Comm const& comm, common::Span data, Func cons return std::move(rc) << [&] { return RingAllgather(comm, data, n_bytes_in_seg, 1, prev_ch, next_ch); - } << [&] { return comm.Block(); }; + } << [&] { + return comm.Block(); + }; }); } } // namespace xgboost::collective::cpu_impl diff --git a/src/collective/allreduce.h b/src/collective/allreduce.h index 0c94d11cc..3e88cca11 100644 --- a/src/collective/allreduce.h +++ b/src/collective/allreduce.h @@ -1,15 +1,18 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #pragma once #include // for int8_t #include // for function #include // for is_invocable_v, enable_if_t +#include // for vector #include "../common/type.h" // for EraseType, RestoreType -#include "../data/array_interface.h" // for ArrayInterfaceHandler +#include "../data/array_interface.h" // for ToDType, ArrayInterfaceHandler #include "comm.h" // for Comm, RestoreType +#include "comm_group.h" // for GlobalCommGroup #include "xgboost/collective/result.h" // for Result +#include "xgboost/context.h" // for Context #include "xgboost/span.h" // for Span namespace xgboost::collective { @@ -27,8 +30,7 @@ std::enable_if_t, common::Span> auto erased = common::EraseType(data); auto type = ToDType::kType; - auto erased_fn = [type, redop](common::Span lhs, - common::Span out) { + auto erased_fn = [redop](common::Span lhs, common::Span out) { CHECK_EQ(lhs.size(), out.size()) << "Invalid input for reduction."; auto lhs_t = common::RestoreType(lhs); auto rhs_t = common::RestoreType(out); @@ -37,4 +39,40 @@ std::enable_if_t, common::Span> return cpu_impl::RingAllreduce(comm, erased, erased_fn, type); } + +template +[[nodiscard]] Result Allreduce(Context const* ctx, CommGroup const& comm, + linalg::TensorView data, Op op) { + if (!comm.IsDistributed()) { + return Success(); + } + CHECK(data.Contiguous()); + auto erased = common::EraseType(data.Values()); + auto type = ToDType::kType; + + auto backend = comm.Backend(data.Device()); + return backend->Allreduce(comm.Ctx(ctx, data.Device()), erased, type, op); +} + +template +[[nodiscard]] Result Allreduce(Context const* ctx, linalg::TensorView data, Op op) { + return Allreduce(ctx, *GlobalCommGroup(), data, op); +} + +/** + * @brief Specialization for std::vector. + */ +template +[[nodiscard]] Result Allreduce(Context const* ctx, std::vector* data, Op op) { + return Allreduce(ctx, linalg::MakeVec(data->data(), data->size()), op); +} + +/** + * @brief Specialization for scalar value. 
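As a usage sketch for the Allreduce overloads added above (tensor view, std::vector, and scalar): the snippet assumes the headers from this patch, an initialised communicator group, and that the collective Op enum provides kSum and kMax as it does elsewhere in the code base; the include path is illustrative and the snippet is not compiled standalone.

#include <vector>

#include "allreduce.h"  // illustrative path for src/collective/allreduce.h
#include "xgboost/context.h"

void ExampleAllreduce(xgboost::Context const* ctx) {
  using namespace xgboost::collective;

  std::vector<double> partial{1.0, 2.0, 3.0};
  // std::vector overload: element-wise sum across all workers, in place.
  auto rc = Allreduce(ctx, &partial, Op::kSum);
  SafeColl(rc);

  double local_max = 42.0;
  // Scalar overload: wrapped into a one-element vector view internally.
  rc = Allreduce(ctx, &local_max, Op::kMax);
  SafeColl(rc);
}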
+ */ +template +[[nodiscard]] std::enable_if_t && std::is_trivial_v, Result> +Allreduce(Context const* ctx, T* data, Op op) { + return Allreduce(ctx, linalg::MakeVec(data, 1), op); +} } // namespace xgboost::collective diff --git a/src/collective/broadcast.h b/src/collective/broadcast.h index 28db83815..61cab8cdd 100644 --- a/src/collective/broadcast.h +++ b/src/collective/broadcast.h @@ -1,11 +1,15 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #pragma once #include // for int32_t, int8_t -#include "comm.h" // for Comm -#include "xgboost/collective/result.h" // for +#include "../common/type.h" +#include "comm.h" // for Comm, EraseType +#include "comm_group.h" // for CommGroup +#include "xgboost/collective/result.h" // for Result +#include "xgboost/context.h" // for Context +#include "xgboost/linalg.h" // for VectorView #include "xgboost/span.h" // for Span namespace xgboost::collective { @@ -23,4 +27,21 @@ template common::Span{reinterpret_cast(data.data()), n_total_bytes}; return cpu_impl::Broadcast(comm, erased, root); } + +template +[[nodiscard]] Result Broadcast(Context const* ctx, CommGroup const& comm, + linalg::VectorView data, std::int32_t root) { + if (!comm.IsDistributed()) { + return Success(); + } + CHECK(data.Contiguous()); + auto erased = common::EraseType(data.Values()); + auto backend = comm.Backend(data.Device()); + return backend->Broadcast(comm.Ctx(ctx, data.Device()), erased, root); +} + +template +[[nodiscard]] Result Broadcast(Context const* ctx, linalg::VectorView data, std::int32_t root) { + return Broadcast(ctx, *GlobalCommGroup(), data, root); +} } // namespace xgboost::collective diff --git a/src/collective/coll.cc b/src/collective/coll.cc index 5f14e4d9a..b19029874 100644 --- a/src/collective/coll.cc +++ b/src/collective/coll.cc @@ -1,5 +1,5 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include "coll.h" @@ -7,6 +7,7 @@ #include // for size_t #include // for int8_t, int64_t #include // for bit_and, bit_or, bit_xor, plus +#include // for string #include // for is_floating_point_v, is_same_v #include // for move @@ -41,6 +42,10 @@ bool constexpr IsFloatingPointV() { auto redop_fn = [](auto lhs, auto out, auto elem_op) { auto p_lhs = lhs.data(); auto p_out = out.data(); +#if defined(__GNUC__) || defined(__clang__) + // For the sum op, one can verify the simd by: addps %xmm15, %xmm14 +#pragma omp simd +#endif for (std::size_t i = 0; i < lhs.size(); ++i) { p_out[i] = elem_op(p_lhs[i], p_out[i]); } @@ -60,6 +65,8 @@ bool constexpr IsFloatingPointV() { return cpu_impl::RingAllreduce(comm, data, erased_fn, type); }; + std::string msg{"Floating point is not supported for bit wise collective operations."}; + auto rc = DispatchDType(type, [&](auto t) { using T = decltype(t); switch (op) { @@ -74,21 +81,21 @@ bool constexpr IsFloatingPointV() { } case Op::kBitwiseAND: { if constexpr (IsFloatingPointV()) { - return Fail("Invalid type."); + return Fail(msg); } else { return fn(std::bit_and<>{}, t); } } case Op::kBitwiseOR: { if constexpr (IsFloatingPointV()) { - return Fail("Invalid type."); + return Fail(msg); } else { return fn(std::bit_or<>{}, t); } } case Op::kBitwiseXOR: { if constexpr (IsFloatingPointV()) { - return Fail("Invalid type."); + return Fail(msg); } else { return fn(std::bit_xor<>{}, t); } @@ -105,9 +112,8 @@ bool constexpr IsFloatingPointV() { return cpu_impl::Broadcast(comm, data, root); } -[[nodiscard]] Result Coll::Allgather(Comm const& comm, common::Span 
data, - std::int64_t size) { - return RingAllgather(comm, data, size); +[[nodiscard]] Result Coll::Allgather(Comm const& comm, common::Span data) { + return RingAllgather(comm, data); } [[nodiscard]] Result Coll::AllgatherV(Comm const& comm, common::Span data, diff --git a/src/collective/coll.cu b/src/collective/coll.cu index 0aac853a5..6943e5fe9 100644 --- a/src/collective/coll.cu +++ b/src/collective/coll.cu @@ -1,10 +1,9 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL) #include // for int8_t, int64_t -#include "../common/cuda_context.cuh" #include "../common/device_helpers.cuh" #include "../data/array_interface.h" #include "allgather.h" // for AllgatherVOffset @@ -166,14 +165,14 @@ ncclRedOp_t GetNCCLRedOp(Op const& op) { } << [&] { return nccl->Block(); }; } -[[nodiscard]] Result NCCLColl::Allgather(Comm const& comm, common::Span data, - std::int64_t size) { +[[nodiscard]] Result NCCLColl::Allgather(Comm const& comm, common::Span data) { if (!comm.IsDistributed()) { return Success(); } auto nccl = dynamic_cast(&comm); CHECK(nccl); auto stub = nccl->Stub(); + auto size = data.size_bytes() / comm.World(); auto send = data.subspan(comm.Rank() * size, size); return Success() << [&] { diff --git a/src/collective/coll.cuh b/src/collective/coll.cuh index 6ededd101..4d45295d7 100644 --- a/src/collective/coll.cuh +++ b/src/collective/coll.cuh @@ -1,5 +1,5 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #pragma once @@ -8,8 +8,7 @@ #include "../data/array_interface.h" // for ArrayInterfaceHandler #include "coll.h" // for Coll #include "comm.h" // for Comm -#include "nccl_stub.h" -#include "xgboost/span.h" // for Span +#include "xgboost/span.h" // for Span namespace xgboost::collective { class NCCLColl : public Coll { @@ -20,8 +19,7 @@ class NCCLColl : public Coll { ArrayInterfaceHandler::Type type, Op op) override; [[nodiscard]] Result Broadcast(Comm const& comm, common::Span data, std::int32_t root) override; - [[nodiscard]] Result Allgather(Comm const& comm, common::Span data, - std::int64_t size) override; + [[nodiscard]] Result Allgather(Comm const& comm, common::Span data) override; [[nodiscard]] Result AllgatherV(Comm const& comm, common::Span data, common::Span sizes, common::Span recv_segments, diff --git a/src/collective/coll.h b/src/collective/coll.h index 1afc8ed59..96fe35229 100644 --- a/src/collective/coll.h +++ b/src/collective/coll.h @@ -48,10 +48,8 @@ class Coll : public std::enable_shared_from_this { * @brief Allgather * * @param [in,out] data Data buffer for input and output. - * @param [in] size Size of data for each worker. */ - [[nodiscard]] virtual Result Allgather(Comm const& comm, common::Span data, - std::int64_t size); + [[nodiscard]] virtual Result Allgather(Comm const& comm, common::Span data); /** * @brief Allgather with variable length. 
* diff --git a/src/collective/coll.hip b/src/collective/coll.hip deleted file mode 100644 index 8f3e09ac1..000000000 --- a/src/collective/coll.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "coll.cu" -#endif diff --git a/src/collective/comm.cc b/src/collective/comm.cc index 38510a292..4d3c81468 100644 --- a/src/collective/comm.cc +++ b/src/collective/comm.cc @@ -1,16 +1,19 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include "comm.h" #include // for copy #include // for seconds +#include // for int32_t #include // for exit #include // for shared_ptr #include // for string +#include // for thread #include // for move, forward - -#include "../common/common.h" // for AssertGPUSupport +#if !defined(XGBOOST_USE_NCCL) +#include "../common/common.h" // for AssertNCCLSupport +#endif // !defined(XGBOOST_USE_NCCL) #include "allgather.h" // for RingAllgather #include "protocol.h" // for kMagic #include "xgboost/base.h" // for XGBOOST_STRICT_R_MODE @@ -21,11 +24,7 @@ namespace xgboost::collective { Comm::Comm(std::string const& host, std::int32_t port, std::chrono::seconds timeout, std::int32_t retry, std::string task_id) - : timeout_{timeout}, - retry_{retry}, - tracker_{host, port, -1}, - task_id_{std::move(task_id)}, - loop_{std::shared_ptr{new Loop{timeout}}} {} + : timeout_{timeout}, retry_{retry}, tracker_{host, port, -1}, task_id_{std::move(task_id)} {} Result ConnectTrackerImpl(proto::PeerInfo info, std::chrono::seconds timeout, std::int32_t retry, std::string const& task_id, TCPSocket* out, std::int32_t rank, @@ -75,9 +74,11 @@ Result ConnectTrackerImpl(proto::PeerInfo info, std::chrono::seconds timeout, st } << [&] { return next->NonBlocking(true); } << [&] { - SockAddrV4 addr; + SockAddress addr; return listener->Accept(prev.get(), &addr); - } << [&] { return prev->NonBlocking(true); }; + } << [&] { + return prev->NonBlocking(true); + }; if (!rc.OK()) { return rc; } @@ -157,10 +158,13 @@ Result ConnectTrackerImpl(proto::PeerInfo info, std::chrono::seconds timeout, st } for (std::int32_t r = 0; r < comm.Rank(); ++r) { - SockAddrV4 addr; auto peer = std::shared_ptr(TCPSocket::CreatePtr(comm.Domain())); - rc = std::move(rc) << [&] { return listener->Accept(peer.get(), &addr); } - << [&] { return peer->RecvTimeout(timeout); }; + rc = std::move(rc) << [&] { + SockAddress addr; + return listener->Accept(peer.get(), &addr); + } << [&] { + return peer->RecvTimeout(timeout); + }; if (!rc.OK()) { return rc; } @@ -182,12 +186,32 @@ Result ConnectTrackerImpl(proto::PeerInfo info, std::chrono::seconds timeout, st return Success(); } -RabitComm::RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout, - std::int32_t retry, std::string task_id, StringView nccl_path) - : HostComm{std::move(host), port, timeout, retry, std::move(task_id)}, +namespace { +std::string InitLog(std::string task_id, std::int32_t rank) { + if (task_id.empty()) { + return "Rank " + std::to_string(rank); + } + return "Task " + task_id + " got rank " + std::to_string(rank); +} +} // namespace + +RabitComm::RabitComm(std::string const& tracker_host, std::int32_t tracker_port, + std::chrono::seconds timeout, std::int32_t retry, std::string task_id, + StringView nccl_path) + : HostComm{tracker_host, tracker_port, timeout, retry, std::move(task_id)}, nccl_path_{std::move(nccl_path)} { + if (this->TrackerInfo().host.empty()) { + // Not in a distributed environment. 
+ LOG(CONSOLE) << InitLog(task_id_, rank_); + return; + } + + loop_.reset(new Loop{std::chrono::seconds{timeout_}}); // NOLINT auto rc = this->Bootstrap(timeout_, retry_, task_id_); - CHECK(rc.OK()) << rc.Report(); + if (!rc.OK()) { + this->ResetState(); + SafeColl(Fail("Failed to bootstrap the communication group.", std::move(rc))); + } } #if !defined(XGBOOST_USE_NCCL) && !defined(XGBOOST_USE_RCCL) @@ -212,20 +236,54 @@ Comm* RabitComm::MakeCUDAVar(Context const*, std::shared_ptr) const { // Start command TCPSocket listener = TCPSocket::Create(tracker.Domain()); - std::int32_t lport = listener.BindHost(); - listener.Listen(); + std::int32_t lport{0}; + rc = std::move(rc) << [&] { + return listener.BindHost(&lport); + } << [&] { + return listener.Listen(); + }; + if (!rc.OK()) { + return rc; + } // create worker for listening to error notice. auto domain = tracker.Domain(); std::shared_ptr error_sock{TCPSocket::CreatePtr(domain)}; - auto eport = error_sock->BindHost(); - error_sock->Listen(); + std::int32_t eport{0}; + rc = std::move(rc) << [&] { + return error_sock->BindHost(&eport); + } << [&] { + return error_sock->Listen(); + }; + if (!rc.OK()) { + return rc; + } + error_port_ = eport; + error_worker_ = std::thread{[error_sock = std::move(error_sock)] { - auto conn = error_sock->Accept(); + TCPSocket conn; + SockAddress addr; + auto rc = error_sock->Accept(&conn, &addr); + // On Linux, a shutdown causes an invalid argument error; + if (rc.Code() == std::errc::invalid_argument) { + return; + } // On Windows, accept returns a closed socket after finalize. if (conn.IsClosed()) { return; } + // The error signal is from the tracker, while shutdown signal is from the shutdown method + // of the RabitComm class (this). + bool is_error{false}; + rc = proto::Error{}.RecvSignal(&conn, &is_error); + if (!rc.OK()) { + LOG(WARNING) << rc.Report(); + return; + } + if (!is_error) { + return; // shutdown + } + LOG(WARNING) << "Another worker is running into error."; #if !defined(XGBOOST_STRICT_R_MODE) || XGBOOST_STRICT_R_MODE == 0 // exit is nicer than abort as the former performs cleanups. @@ -234,6 +292,9 @@ Comm* RabitComm::MakeCUDAVar(Context const*, std::shared_ptr) const { LOG(FATAL) << "abort"; #endif }}; + // The worker thread is detached here to avoid the need to handle it later during + // destruction. For C++, if a thread is not joined or detached, it will segfault during + // destruction. 
error_worker_.detach(); proto::Start start; @@ -246,11 +307,13 @@ Comm* RabitComm::MakeCUDAVar(Context const*, std::shared_ptr) const { // get ring neighbors std::string snext; - tracker.Recv(&snext); + rc = tracker.Recv(&snext); + if (!rc.OK()) { + return Fail("Failed to receive the rank for the next worker.", std::move(rc)); + } auto jnext = Json::Load(StringView{snext}); proto::PeerInfo ninfo{jnext}; - // get the rank of this worker this->rank_ = BootstrapPrev(ninfo.rank, world); this->tracker_.rank = rank_; @@ -258,20 +321,27 @@ Comm* RabitComm::MakeCUDAVar(Context const*, std::shared_ptr) const { std::vector> workers; rc = ConnectWorkers(*this, &listener, lport, ninfo, timeout, retry, &workers); if (!rc.OK()) { - return rc; + return Fail("Failed to connect to other workers.", std::move(rc)); } CHECK(this->channels_.empty()); for (auto& w : workers) { if (w) { - rc = std::move(rc) << [&] { return w->SetNoDelay(); } << [&] { return w->NonBlocking(true); } - << [&] { return w->SetKeepAlive(); }; + rc = std::move(rc) << [&] { + return w->SetNoDelay(); + } << [&] { + return w->NonBlocking(true); + } << [&] { + return w->SetKeepAlive(); + }; } if (!rc.OK()) { return rc; } this->channels_.emplace_back(std::make_shared(*this, w)); } + + LOG(CONSOLE) << InitLog(task_id_, rank_); return rc; } @@ -279,6 +349,8 @@ RabitComm::~RabitComm() noexcept(false) { if (!this->IsDistributed()) { return; } + LOG(WARNING) << "The communicator is being destroyed without a call to shutdown first. This can " + "lead to undefined behaviour."; auto rc = this->Shutdown(); if (!rc.OK()) { LOG(WARNING) << rc.Report(); @@ -286,24 +358,52 @@ RabitComm::~RabitComm() noexcept(false) { } [[nodiscard]] Result RabitComm::Shutdown() { + if (!this->IsDistributed()) { + return Success(); + } + // Tell the tracker that this worker is shutting down. TCPSocket tracker; + // Tell the error hanlding thread that we are shutting down. + TCPSocket err_client; + return Success() << [&] { return ConnectTrackerImpl(tracker_, timeout_, retry_, task_id_, &tracker, Rank(), World()); } << [&] { return this->Block(); } << [&] { - Json jcmd{Object{}}; - jcmd["cmd"] = Integer{static_cast(proto::CMD::kShutdown)}; - auto scmd = Json::Dump(jcmd); - auto n_bytes = tracker.Send(scmd); - if (n_bytes != scmd.size()) { - return Fail("Faled to send cmd."); - } + return proto::ShutdownCMD{}.Send(&tracker); + } << [&] { + this->channels_.clear(); return Success(); + } << [&] { + // Use tracker address to determine whether we want to use IPv6. + auto taddr = MakeSockAddress(xgboost::StringView{this->tracker_.host}, this->tracker_.port); + // Shutdown the error handling thread. We signal the thread through socket, + // alternatively, we can get the native handle and use pthread_cancel. But using a + // socket seems to be clearer as we know what's happening. + auto const& addr = taddr.IsV4() ? SockAddrV4::Loopback().Addr() : SockAddrV6::Loopback().Addr(); + // We use hardcoded 10 seconds and 1 retry here since we are just connecting to a + // local socket. For a normal OS, this should be enough time to schedule the + // connection. + auto rc = Connect(StringView{addr}, this->error_port_, 1, + std::min(std::chrono::seconds{10}, timeout_), &err_client); + this->ResetState(); + if (!rc.OK()) { + return Fail("Failed to connect to the error socket.", std::move(rc)); + } + return rc; + } << [&] { + // We put error thread shutdown at the end so that we have a better chance to finish + // the previous more important steps. 
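The shutdown path above reconnects to the local error-listener port purely to wake the detached handler thread, so that a normal teardown is not mistaken for a peer failure. The stand-alone sketch below mimics that detached-listener pattern, with a std::future in place of the loopback socket and an enum in place of the wire signal:

#include <chrono>
#include <future>
#include <iostream>
#include <thread>

enum class Signal { kShutdown, kError };  // illustrative stand-in for the wire signal

int main() {
  std::promise<Signal> signal;
  std::thread handler{[fut = signal.get_future()]() mutable {
    auto sig = fut.get();                 // blocks like the accept()+recv in the patch
    if (sig == Signal::kShutdown) {
      return;                             // local teardown, nothing to report
    }
    std::cerr << "Another worker is running into error.\n";
    // the real handler exits the process at this point
  }};
  handler.detach();                       // detached, like error_worker_ above

  signal.set_value(Signal::kShutdown);    // Shutdown(): wake the listener benignly
  std::this_thread::sleep_for(std::chrono::milliseconds{100});
}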
+ return proto::Error{}.SignalShutdown(&err_client); }; } [[nodiscard]] Result RabitComm::LogTracker(std::string msg) const { + if (!this->IsDistributed()) { + LOG(CONSOLE) << msg; + return Success(); + } TCPSocket out; proto::Print print; return Success() << [&] { return this->ConnectTracker(&out); } @@ -311,8 +411,11 @@ RabitComm::~RabitComm() noexcept(false) { } [[nodiscard]] Result RabitComm::SignalError(Result const& res) { - TCPSocket out; - return Success() << [&] { return this->ConnectTracker(&out); } - << [&] { return proto::ErrorCMD{}.WorkerSend(&out, res); }; + TCPSocket tracker; + return Success() << [&] { + return this->ConnectTracker(&tracker); + } << [&] { + return proto::ErrorCMD{}.WorkerSend(&tracker, res); + }; } } // namespace xgboost::collective diff --git a/src/collective/comm.cu b/src/collective/comm.cu index c453b92cf..ff4be5090 100644 --- a/src/collective/comm.cu +++ b/src/collective/comm.cu @@ -27,7 +27,7 @@ Result GetUniqueId(Comm const& comm, std::shared_ptr stub, std::shared ncclUniqueId id; if (comm.Rank() == kRootRank) { auto rc = stub->GetUniqueId(&id); - CHECK(rc.OK()) << rc.Report(); + SafeColl(rc); } auto rc = coll->Broadcast( comm, common::Span{reinterpret_cast(&id), sizeof(ncclUniqueId)}, kRootRank); @@ -90,9 +90,8 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr p auto s_this_uuid = s_uuid.subspan(root.Rank() * kUuidLength, kUuidLength); GetCudaUUID(s_this_uuid, ctx->Device()); - auto rc = pimpl->Allgather(root, common::EraseType(s_uuid), s_this_uuid.size_bytes()); - - CHECK(rc.OK()) << rc.Report(); + auto rc = pimpl->Allgather(root, common::EraseType(s_uuid)); + SafeColl(rc); std::vector> converted(root.World()); std::size_t j = 0; @@ -113,7 +112,7 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr p [&] { return this->stub_->CommInitRank(&nccl_comm_, root.World(), nccl_unique_id_, root.Rank()); }; - CHECK(rc.OK()) << rc.Report(); + SafeColl(rc); for (std::int32_t r = 0; r < root.World(); ++r) { this->channels_.emplace_back( @@ -124,7 +123,7 @@ NCCLComm::NCCLComm(Context const* ctx, Comm const& root, std::shared_ptr p NCCLComm::~NCCLComm() { if (nccl_comm_) { auto rc = stub_->CommDestroy(nccl_comm_); - CHECK(rc.OK()) << rc.Report(); + SafeColl(rc); } } } // namespace xgboost::collective diff --git a/src/collective/comm.cuh b/src/collective/comm.cuh index fcc919c54..1cf2c06cf 100644 --- a/src/collective/comm.cuh +++ b/src/collective/comm.cuh @@ -53,6 +53,10 @@ class NCCLComm : public Comm { auto rc = this->Stream().Sync(false); return GetCUDAResult(rc); } + [[nodiscard]] Result Shutdown() final { + this->ResetState(); + return Success(); + } }; class NCCLChannel : public Channel { diff --git a/src/collective/comm.h b/src/collective/comm.h index 82aa2c45e..a41f47be9 100644 --- a/src/collective/comm.h +++ b/src/collective/comm.h @@ -1,10 +1,10 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #pragma once #include // for seconds #include // for size_t -#include // for int32_t +#include // for int32_t, int64_t #include // for shared_ptr #include // for string #include // for thread @@ -14,13 +14,13 @@ #include "loop.h" // for Loop #include "protocol.h" // for PeerInfo #include "xgboost/collective/result.h" // for Result -#include "xgboost/collective/socket.h" // for TCPSocket +#include "xgboost/collective/socket.h" // for TCPSocket, GetHostName #include "xgboost/context.h" // for Context #include "xgboost/span.h" // for Span namespace xgboost::collective { 
-inline constexpr std::int32_t DefaultTimeoutSec() { return 300; } // 5min +inline constexpr std::int64_t DefaultTimeoutSec() { return 300; } // 5min inline constexpr std::int32_t DefaultRetry() { return 3; } // indexing into the ring @@ -51,11 +51,25 @@ class Comm : public std::enable_shared_from_this { proto::PeerInfo tracker_; SockDomain domain_{SockDomain::kV4}; + std::thread error_worker_; + std::int32_t error_port_; + std::string task_id_; std::vector> channels_; - std::shared_ptr loop_{new Loop{std::chrono::seconds{ - DefaultTimeoutSec()}}}; // fixme: require federated comm to have a timeout + std::shared_ptr loop_{nullptr}; // fixme: require federated comm to have a timeout + + void ResetState() { + this->world_ = -1; + this->rank_ = 0; + this->timeout_ = std::chrono::seconds{DefaultTimeoutSec()}; + + tracker_ = proto::PeerInfo{}; + this->task_id_.clear(); + channels_.clear(); + + loop_.reset(); + } public: Comm() = default; @@ -75,10 +89,13 @@ class Comm : public std::enable_shared_from_this { [[nodiscard]] auto Retry() const { return retry_; } [[nodiscard]] auto TaskID() const { return task_id_; } - [[nodiscard]] auto Rank() const { return rank_; } - [[nodiscard]] auto World() const { return IsDistributed() ? world_ : 1; } - [[nodiscard]] bool IsDistributed() const { return world_ != -1; } - void Submit(Loop::Op op) const { loop_->Submit(op); } + [[nodiscard]] auto Rank() const noexcept { return rank_; } + [[nodiscard]] auto World() const noexcept { return IsDistributed() ? world_ : 1; } + [[nodiscard]] bool IsDistributed() const noexcept { return world_ != -1; } + void Submit(Loop::Op op) const { + CHECK(loop_); + loop_->Submit(op); + } [[nodiscard]] virtual Result Block() const { return loop_->Block(); } [[nodiscard]] virtual std::shared_ptr Chan(std::int32_t rank) const { @@ -88,6 +105,14 @@ class Comm : public std::enable_shared_from_this { [[nodiscard]] virtual Result LogTracker(std::string msg) const = 0; [[nodiscard]] virtual Result SignalError(Result const&) { return Success(); } + /** + * @brief Get a string ID for the current process. + */ + [[nodiscard]] virtual Result ProcessorName(std::string* out) const { + auto rc = GetHostName(out); + return rc; + } + [[nodiscard]] virtual Result Shutdown() = 0; }; /** @@ -105,20 +130,20 @@ class RabitComm : public HostComm { [[nodiscard]] Result Bootstrap(std::chrono::seconds timeout, std::int32_t retry, std::string task_id); - [[nodiscard]] Result Shutdown(); public: // bootstrapping construction. RabitComm() = default; - // ctor for testing where environment is known. 
- RabitComm(std::string const& host, std::int32_t port, std::chrono::seconds timeout, - std::int32_t retry, std::string task_id, StringView nccl_path); + RabitComm(std::string const& tracker_host, std::int32_t tracker_port, + std::chrono::seconds timeout, std::int32_t retry, std::string task_id, + StringView nccl_path); ~RabitComm() noexcept(false) override; [[nodiscard]] bool IsFederated() const override { return false; } [[nodiscard]] Result LogTracker(std::string msg) const override; [[nodiscard]] Result SignalError(Result const&) override; + [[nodiscard]] Result Shutdown() final; [[nodiscard]] Comm* MakeCUDAVar(Context const* ctx, std::shared_ptr pimpl) const override; }; diff --git a/src/collective/comm.hip b/src/collective/comm.hip deleted file mode 100644 index e8619d41f..000000000 --- a/src/collective/comm.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "comm.cu" -#endif diff --git a/src/collective/comm_group.cc b/src/collective/comm_group.cc index f7bbba754..18a5ba8a7 100644 --- a/src/collective/comm_group.cc +++ b/src/collective/comm_group.cc @@ -1,22 +1,21 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include "comm_group.h" #include // for transform +#include // for tolower #include // for seconds #include // for int32_t +#include // for back_inserter #include // for shared_ptr, unique_ptr #include // for string -#include // for vector -#include "../common/json_utils.h" // for OptionalArg -#include "coll.h" // for Coll -#include "comm.h" // for Comm -#include "tracker.h" // for GetHostAddress -#include "xgboost/collective/result.h" // for Result -#include "xgboost/context.h" // for DeviceOrd -#include "xgboost/json.h" // for Json +#include "../common/json_utils.h" // for OptionalArg +#include "coll.h" // for Coll +#include "comm.h" // for Comm +#include "xgboost/context.h" // for DeviceOrd +#include "xgboost/json.h" // for Json #if defined(XGBOOST_USE_FEDERATED) #include "../../plugin/federated/federated_coll.h" @@ -65,6 +64,9 @@ CommGroup::CommGroup() auto const& obj = get(config); auto it = obj.find(upper); + if (it != obj.cend() && obj.find(name) != obj.cend()) { + LOG(FATAL) << "Duplicated parameter:" << name; + } if (it != obj.cend()) { return OptionalArg(config, upper, dft); } else { @@ -78,14 +80,14 @@ CommGroup::CommGroup() auto task_id = get_param("dmlc_task_id", std::string{}, String{}); if (type == "rabit") { - auto host = get_param("dmlc_tracker_uri", std::string{}, String{}); - auto port = get_param("dmlc_tracker_port", static_cast(0), Integer{}); + auto tracker_host = get_param("dmlc_tracker_uri", std::string{}, String{}); + auto tracker_port = get_param("dmlc_tracker_port", static_cast(0), Integer{}); auto nccl = get_param("dmlc_nccl_path", std::string{DefaultNcclName()}, String{}); - auto ptr = - new CommGroup{std::shared_ptr{new RabitComm{ // NOLINT - host, static_cast(port), std::chrono::seconds{timeout}, - static_cast(retry), task_id, nccl}}, - std::shared_ptr(new Coll{})}; // NOLINT + auto ptr = new CommGroup{ + std::shared_ptr{new RabitComm{ // NOLINT + tracker_host, static_cast(tracker_port), std::chrono::seconds{timeout}, + static_cast(retry), task_id, nccl}}, + std::shared_ptr(new Coll{})}; // NOLINT return ptr; } else if (type == "federated") { #if defined(XGBOOST_USE_FEDERATED) @@ -117,6 +119,8 @@ void GlobalCommGroupInit(Json config) { void GlobalCommGroupFinalize() { auto& sptr = GlobalCommGroup(); + auto rc = sptr->Finalize(); sptr.reset(); + SafeColl(rc); } } // 
namespace xgboost::collective diff --git a/src/collective/comm_group.h b/src/collective/comm_group.h index 2f6f91d73..a98de0c16 100644 --- a/src/collective/comm_group.h +++ b/src/collective/comm_group.h @@ -9,7 +9,6 @@ #include "coll.h" // for Comm #include "comm.h" // for Coll #include "xgboost/collective/result.h" // for Result -#include "xgboost/collective/socket.h" // for GetHostName namespace xgboost::collective { /** @@ -31,19 +30,35 @@ class CommGroup { public: CommGroup(); - [[nodiscard]] auto World() const { return comm_->World(); } - [[nodiscard]] auto Rank() const { return comm_->Rank(); } - [[nodiscard]] bool IsDistributed() const { return comm_->IsDistributed(); } + [[nodiscard]] auto World() const noexcept { return comm_->World(); } + [[nodiscard]] auto Rank() const noexcept { return comm_->Rank(); } + [[nodiscard]] bool IsDistributed() const noexcept { return comm_->IsDistributed(); } + + [[nodiscard]] Result Finalize() const { + return Success() << [this] { + if (gpu_comm_) { + return gpu_comm_->Shutdown(); + } + return Success(); + } << [&] { + return comm_->Shutdown(); + }; + } [[nodiscard]] static CommGroup* Create(Json config); [[nodiscard]] std::shared_ptr Backend(DeviceOrd device) const; + /** + * @brief Decide the context to use for communication. + * + * @param ctx Global context, provides the CUDA stream and ordinal. + * @param device The device used by the data to be communicated. + */ [[nodiscard]] Comm const& Ctx(Context const* ctx, DeviceOrd device) const; [[nodiscard]] Result SignalError(Result const& res) { return comm_->SignalError(res); } [[nodiscard]] Result ProcessorName(std::string* out) const { - auto rc = GetHostName(out); - return rc; + return this->comm_->ProcessorName(out); } }; diff --git a/src/collective/communicator-inl.cc b/src/collective/communicator-inl.cc new file mode 100644 index 000000000..4164855f1 --- /dev/null +++ b/src/collective/communicator-inl.cc @@ -0,0 +1,34 @@ +/** + * Copyright 2024, XGBoost contributors + */ +#include "communicator-inl.h" + +namespace xgboost::collective { +[[nodiscard]] std::vector> VectorAllgatherV( + std::vector> const &input) { + auto n_inputs = input.size(); + std::vector sizes(n_inputs); + std::transform(input.cbegin(), input.cend(), sizes.begin(), + [](auto const &vec) { return vec.size(); }); + + std::vector global_sizes = AllgatherV(sizes); + std::vector offset(global_sizes.size() + 1); + offset[0] = 0; + for (std::size_t i = 1; i < offset.size(); i++) { + offset[i] = offset[i - 1] + global_sizes[i - 1]; + } + + std::vector collected; + for (auto const &vec : input) { + collected.insert(collected.end(), vec.cbegin(), vec.cend()); + } + auto out = AllgatherV(collected); + + std::vector> result; + for (std::size_t i = 1; i < offset.size(); ++i) { + std::vector local(out.cbegin() + offset[i - 1], out.cbegin() + offset[i]); + result.emplace_back(std::move(local)); + } + return result; +} +} // namespace xgboost::collective diff --git a/src/collective/communicator-inl.h b/src/collective/communicator-inl.h index 34212def2..991e19f2c 100644 --- a/src/collective/communicator-inl.h +++ b/src/collective/communicator-inl.h @@ -1,5 +1,5 @@ /** - * Copyright 2022-2023 by XGBoost contributors + * Copyright 2022-2024, XGBoost contributors */ #pragma once #include @@ -192,6 +192,18 @@ inline std::vector AllgatherV(std::vector const &input) { return result; } +/** + * @brief Gathers variable-length data from all processes and distributes it to all processes. + * + * @param inputs All the inputs from the local worker. 
The number of inputs can vary + * across different workers. Along with which, the size of each vector in + * the input can also vary. + * + * @return The AllgatherV result, containing vectors from all workers. + */ +[[nodiscard]] std::vector> VectorAllgatherV( + std::vector> const &input); + /** * @brief Gathers variable-length strings from all processes and distributes them to all processes. * @param input Variable-length list of variable-length strings. @@ -294,38 +306,5 @@ template inline void Allreduce(double *send_receive_buffer, size_t count) { Communicator::Get()->AllReduce(send_receive_buffer, count, DataType::kDouble, op); } - -template -struct SpecialAllgatherVResult { - std::vector offsets; - std::vector sizes; - std::vector result; -}; - -/** - * @brief Gathers variable-length data from all processes and distributes it to all processes. - * - * We assume each worker has the same number of inputs, but each input may be of a different size. - * - * @param inputs All the inputs from the local worker. - * @param sizes Sizes of each input. - */ -template -inline SpecialAllgatherVResult SpecialAllgatherV(std::vector const &inputs, - std::vector const &sizes) { - // Gather the sizes across all workers. - auto const all_sizes = Allgather(sizes); - - // Calculate input offsets (std::exclusive_scan). - std::vector offsets(all_sizes.size()); - for (std::size_t i = 1; i < offsets.size(); i++) { - offsets[i] = offsets[i - 1] + all_sizes[i - 1]; - } - - // Gather all the inputs. - auto const all_inputs = AllgatherV(inputs); - - return {offsets, all_sizes, all_inputs}; -} } // namespace collective } // namespace xgboost diff --git a/src/collective/communicator.hip b/src/collective/communicator.hip deleted file mode 100644 index 5a438771c..000000000 --- a/src/collective/communicator.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "communicator.cu" -#endif diff --git a/src/collective/device_communicator_adapter.cuh b/src/collective/device_communicator_adapter.cuh index d149348a6..41b061066 100644 --- a/src/collective/device_communicator_adapter.cuh +++ b/src/collective/device_communicator_adapter.cuh @@ -3,6 +3,8 @@ */ #pragma once +#include // for accumulate + #include "communicator.h" #include "device_communicator.cuh" diff --git a/src/collective/in_memory_handler.h b/src/collective/in_memory_handler.h index f9ac52007..e9c69f537 100644 --- a/src/collective/in_memory_handler.h +++ b/src/collective/in_memory_handler.h @@ -32,7 +32,8 @@ class InMemoryHandler { * * This is used when the handler only needs to be initialized once with a known world size. */ - explicit InMemoryHandler(std::size_t worldSize) : world_size_{worldSize} {} + explicit InMemoryHandler(std::int32_t worldSize) + : world_size_{static_cast(worldSize)} {} /** * @brief Initialize the handler with the world size and rank. 
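The VectorAllgatherV helpers added earlier in this patch follow a two-phase scheme: gather the per-vector sizes, turn them into offsets with an exclusive scan, gather the concatenated payload, then slice it back apart per input. A self-contained sketch of the offset computation and slicing, with the collective gather replaced by an already-concatenated payload:

#include <cstddef>
#include <cstdint>
#include <vector>

// Given the gathered per-input sizes, compute the exclusive-scan offsets used
// to slice the gathered payload back into individual vectors.
std::vector<std::size_t> Offsets(std::vector<std::size_t> const& sizes) {
  std::vector<std::size_t> offset(sizes.size() + 1, 0);
  for (std::size_t i = 1; i < offset.size(); ++i) {
    offset[i] = offset[i - 1] + sizes[i - 1];
  }
  return offset;
}

int main() {
  // Two gathered inputs of 3 and 2 bytes; the payload is their concatenation.
  std::vector<std::size_t> sizes{3, 2};
  std::vector<std::int8_t> payload{1, 2, 3, 4, 5};
  auto off = Offsets(sizes);  // {0, 3, 5}
  std::vector<std::vector<std::int8_t>> result;
  for (std::size_t i = 1; i < off.size(); ++i) {
    result.emplace_back(payload.begin() + off[i - 1], payload.begin() + off[i]);
  }
}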
diff --git a/src/collective/loop.cc b/src/collective/loop.cc index b51749fcd..0cd41426d 100644 --- a/src/collective/loop.cc +++ b/src/collective/loop.cc @@ -18,9 +18,11 @@ #include "xgboost/logging.h" // for CHECK namespace xgboost::collective { -Result Loop::EmptyQueue(std::queue* p_queue) const { +Result Loop::ProcessQueue(std::queue* p_queue, bool blocking) const { timer_.Start(__func__); - auto error = [this] { timer_.Stop(__func__); }; + auto error = [this] { + timer_.Stop(__func__); + }; if (stop_) { timer_.Stop(__func__); @@ -48,6 +50,9 @@ Result Loop::EmptyQueue(std::queue* p_queue) const { poll.WatchWrite(*op.sock); break; } + case Op::kSleep: { + break; + } default: { error(); return Fail("Invalid socket operation."); @@ -59,12 +64,14 @@ Result Loop::EmptyQueue(std::queue* p_queue) const { // poll, work on fds that are ready. timer_.Start("poll"); - auto rc = poll.Poll(timeout_); - timer_.Stop("poll"); - if (!rc.OK()) { - error(); - return rc; + if (!poll.fds.empty()) { + auto rc = poll.Poll(timeout_); + if (!rc.OK()) { + error(); + return rc; + } } + timer_.Stop("poll"); // we wonldn't be here if the queue is empty. CHECK(!qcopy.empty()); @@ -75,12 +82,20 @@ Result Loop::EmptyQueue(std::queue* p_queue) const { qcopy.pop(); std::int32_t n_bytes_done{0}; - CHECK(op.sock->NonBlocking()); + if (!op.sock) { + CHECK(op.code == Op::kSleep); + } else { + CHECK(op.sock->NonBlocking()); + } switch (op.code) { case Op::kRead: { if (poll.CheckRead(*op.sock)) { n_bytes_done = op.sock->Recv(op.ptr + op.off, op.n - op.off); + if (n_bytes_done == 0) { + error(); + return Fail("Encountered EOF. The other end is likely closed."); + } } break; } @@ -90,6 +105,12 @@ Result Loop::EmptyQueue(std::queue* p_queue) const { } break; } + case Op::kSleep: { + // For testing only. + std::this_thread::sleep_for(std::chrono::seconds{op.n}); + n_bytes_done = op.n; + break; + } default: { error(); return Fail("Invalid socket operation."); @@ -110,6 +131,10 @@ Result Loop::EmptyQueue(std::queue* p_queue) const { qcopy.push(op); } } + + if (!blocking) { + break; + } } timer_.Stop(__func__); @@ -128,6 +153,15 @@ void Loop::Process() { while (true) { try { std::unique_lock lock{mu_}; + // This can handle missed notification: wait(lock, predicate) is equivalent to: + // + // while (!predicate()) { + // cv.wait(lock); + // } + // + // As a result, if there's a missed notification, the queue wouldn't be empty, hence + // the predicate would be false and the actual wait wouldn't be invoked. Therefore, + // the blocking call can never go unanswered. cv_.wait(lock, [this] { return !this->queue_.empty() || stop_; }); if (stop_) { break; // only point where this loop can exit. @@ -142,26 +176,27 @@ void Loop::Process() { queue_.pop(); if (op.code == Op::kBlock) { is_blocking = true; - // Block must be the last op in the current batch since no further submit can be - // issued until the blocking call is finished. - CHECK(queue_.empty()); } else { qcopy.push(op); } } - if (!is_blocking) { - // Unblock, we can write to the global queue again. - lock.unlock(); + lock.unlock(); + // Clear the local queue, if `is_blocking` is true, this is blocking the current + // worker thread (but not the client thread), wait until all operations are + // finished. + auto rc = this->ProcessQueue(&qcopy, is_blocking); + + if (is_blocking && rc.OK()) { + CHECK(qcopy.empty()); } - - // Clear the local queue, this is blocking the current worker thread (but not the - // client thread), wait until all operations are finished. 
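The comment block added above spells out why cv_.wait(lock, predicate) cannot lose a wake-up: the queue is always modified under the mutex before notify_one, and the predicate re-checks the queue on entry. A minimal stand-alone producer/consumer with the same submit/notify/wait shape:

#include <condition_variable>
#include <mutex>
#include <queue>
#include <thread>

std::mutex mu;
std::condition_variable cv;
std::queue<int> queue;
bool stop = false;

void Producer(int v) {
  {
    std::lock_guard<std::mutex> lock{mu};
    queue.push(v);  // the state change happens under the lock...
  }
  cv.notify_one();  // ...so an "early" notify is never lost: the predicate re-checks.
}

void Consumer() {
  while (true) {
    std::unique_lock<std::mutex> lock{mu};
    cv.wait(lock, [] { return !queue.empty() || stop; });
    if (stop) break;
    queue.pop();  // drain one item per wake-up for brevity
  }
}

int main() {
  std::thread worker{Consumer};
  Producer(1);
  {
    std::lock_guard<std::mutex> lock{mu};
    stop = true;
  }
  cv.notify_one();
  worker.join();
}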
- auto rc = this->EmptyQueue(&qcopy); - - if (is_blocking) { - // The unlock is delayed if this is a blocking call - lock.unlock(); + // Push back the remaining operations. + if (rc.OK()) { + std::unique_lock lock{mu_}; + while (!qcopy.empty()) { + queue_.push(qcopy.front()); + qcopy.pop(); + } } // Notify the client thread who called block after all error conditions are set. @@ -228,7 +263,6 @@ Result Loop::Stop() { } this->Submit(Op{Op::kBlock}); - { // Wait for the block call to finish. std::unique_lock lock{mu_}; @@ -243,8 +277,20 @@ Result Loop::Stop() { } } +void Loop::Submit(Op op) { + std::unique_lock lock{mu_}; + if (op.code != Op::kBlock) { + CHECK_NE(op.n, 0); + } + queue_.push(op); + lock.unlock(); + cv_.notify_one(); +} + Loop::Loop(std::chrono::seconds timeout) : timeout_{timeout} { timer_.Init(__func__); - worker_ = std::thread{[this] { this->Process(); }}; + worker_ = std::thread{[this] { + this->Process(); + }}; } } // namespace xgboost::collective diff --git a/src/collective/loop.h b/src/collective/loop.h index 4839abfd3..a4de2a81b 100644 --- a/src/collective/loop.h +++ b/src/collective/loop.h @@ -19,20 +19,27 @@ namespace xgboost::collective { class Loop { public: struct Op { - enum Code : std::int8_t { kRead = 0, kWrite = 1, kBlock = 2 } code; + // kSleep is only for testing + enum Code : std::int8_t { kRead = 0, kWrite = 1, kBlock = 2, kSleep = 4 } code; std::int32_t rank{-1}; std::int8_t* ptr{nullptr}; std::size_t n{0}; TCPSocket* sock{nullptr}; std::size_t off{0}; - explicit Op(Code c) : code{c} { CHECK(c == kBlock); } + explicit Op(Code c) : code{c} { CHECK(c == kBlock || c == kSleep); } Op(Code c, std::int32_t rank, std::int8_t* ptr, std::size_t n, TCPSocket* sock, std::size_t off) : code{c}, rank{rank}, ptr{ptr}, n{n}, sock{sock}, off{off} {} Op(Op const&) = default; Op& operator=(Op const&) = default; Op(Op&&) = default; Op& operator=(Op&&) = default; + // For testing purpose only + [[nodiscard]] static Op Sleep(std::size_t seconds) { + Op op{kSleep}; + op.n = seconds; + return op; + } }; private: @@ -54,7 +61,7 @@ class Loop { std::exception_ptr curr_exce_{nullptr}; common::Monitor mutable timer_; - Result EmptyQueue(std::queue* p_queue) const; + Result ProcessQueue(std::queue* p_queue, bool blocking) const; // The cunsumer function that runs inside a worker thread. void Process(); @@ -64,12 +71,7 @@ class Loop { */ Result Stop(); - void Submit(Op op) { - std::unique_lock lock{mu_}; - queue_.push(op); - lock.unlock(); - cv_.notify_one(); - } + void Submit(Op op); /** * @brief Block the event loop until all ops are finished. 
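Each Loop::Op above carries the socket, the buffer pointer, the byte count and a running offset, which is what lets the worker thread re-queue a partially completed read or write after the next poll. Below is a hedged sketch of submitting a read through this interface, using the names as they appear in the patch; the include paths are illustrative and the snippet is not compiled standalone.

#include <cstdint>
#include <vector>

#include "loop.h"                        // illustrative path for src/collective/loop.h
#include "xgboost/collective/result.h"   // assumed to declare SafeColl

void QueueRead(xgboost::collective::Loop* loop, xgboost::collective::TCPSocket* sock,
               std::int32_t rank, std::vector<std::int8_t>* buf) {
  using Op = xgboost::collective::Loop::Op;
  // kRead: receive buf->size() bytes from sock into buf, starting at offset 0.
  // The loop keeps re-queueing the op with an updated offset until it is done.
  loop->Submit(Op{Op::kRead, rank, buf->data(), buf->size(), sock, 0});
  // Block() waits until every submitted op has completed or an error is set.
  auto rc = loop->Block();
  xgboost::collective::SafeColl(rc);
}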
In the case of failure, this diff --git a/src/collective/nccl_device_communicator.cu b/src/collective/nccl_device_communicator.cu index a1f45bfaf..35bcc773c 100644 --- a/src/collective/nccl_device_communicator.cu +++ b/src/collective/nccl_device_communicator.cu @@ -2,6 +2,8 @@ * Copyright 2023 XGBoost contributors */ #if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL) +#include // for accumulate + #include "comm.cuh" #include "nccl_device_communicator.cuh" diff --git a/src/collective/nccl_device_communicator.hip b/src/collective/nccl_device_communicator.hip deleted file mode 100644 index 765c18d79..000000000 --- a/src/collective/nccl_device_communicator.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "nccl_device_communicator.cu" -#endif diff --git a/src/collective/protocol.h b/src/collective/protocol.h index 96edf4e29..29e6c9619 100644 --- a/src/collective/protocol.h +++ b/src/collective/protocol.h @@ -1,5 +1,5 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #pragma once #include // for int32_t @@ -58,6 +58,7 @@ struct Magic { } }; +// Basic commands for communication between workers and the tracker. enum class CMD : std::int32_t { kInvalid = 0, kStart = 1, @@ -84,7 +85,10 @@ struct Connect { [[nodiscard]] Result TrackerRecv(TCPSocket* sock, std::int32_t* world, std::int32_t* rank, std::string* task_id) const { std::string init; - sock->Recv(&init); + auto rc = sock->Recv(&init); + if (!rc.OK()) { + return Fail("Connect protocol failed.", std::move(rc)); + } auto jinit = Json::Load(StringView{init}); *world = get(jinit["world_size"]); *rank = get(jinit["rank"]); @@ -122,9 +126,9 @@ class Start { } [[nodiscard]] Result WorkerRecv(TCPSocket* tracker, std::int32_t* p_world) const { std::string scmd; - auto n_bytes = tracker->Recv(&scmd); - if (n_bytes <= 0) { - return Fail("Failed to recv init command from tracker."); + auto rc = tracker->Recv(&scmd); + if (!rc.OK()) { + return Fail("Failed to recv init command from tracker.", std::move(rc)); } auto jcmd = Json::Load(scmd); auto world = get(jcmd["world_size"]); @@ -132,7 +136,7 @@ class Start { return Fail("Invalid world size."); } *p_world = world; - return Success(); + return rc; } [[nodiscard]] Result TrackerHandle(Json jcmd, std::int32_t* recv_world, std::int32_t world, std::int32_t* p_port, TCPSocket* p_sock, @@ -150,6 +154,7 @@ class Start { } }; +// Protocol for communicating with the tracker for printing message. struct Print { [[nodiscard]] Result WorkerSend(TCPSocket* tracker, std::string msg) const { Json jcmd{Object{}}; @@ -172,6 +177,7 @@ struct Print { } }; +// Protocol for communicating with the tracker during error. struct ErrorCMD { [[nodiscard]] Result WorkerSend(TCPSocket* tracker, Result const& res) const { auto msg = res.Report(); @@ -199,6 +205,7 @@ struct ErrorCMD { } }; +// Protocol for communicating with the tracker during shutdown. struct ShutdownCMD { [[nodiscard]] Result Send(TCPSocket* peer) const { Json jcmd{Object{}}; @@ -211,4 +218,40 @@ struct ShutdownCMD { return Success(); } }; + +// Protocol for communicating with the local error handler during error or shutdown. Only +// one protocol that doesn't have the tracker involved. 
+struct Error { + constexpr static std::int32_t ShutdownSignal() { return 0; } + constexpr static std::int32_t ErrorSignal() { return -1; } + + [[nodiscard]] Result SignalError(TCPSocket* worker) const { + std::int32_t err{ErrorSignal()}; + auto n_sent = worker->SendAll(&err, sizeof(err)); + if (n_sent == sizeof(err)) { + return Success(); + } + return Fail("Failed to send error signal"); + } + // self is localhost, we are sending the signal to the error handling thread for it to + // close. + [[nodiscard]] Result SignalShutdown(TCPSocket* self) const { + std::int32_t err{ShutdownSignal()}; + auto n_sent = self->SendAll(&err, sizeof(err)); + if (n_sent == sizeof(err)) { + return Success(); + } + return Fail("Failed to send shutdown signal"); + } + // get signal, either for error or for shutdown. + [[nodiscard]] Result RecvSignal(TCPSocket* peer, bool* p_is_error) const { + std::int32_t err{ShutdownSignal()}; + auto n_recv = peer->RecvAll(&err, sizeof(err)); + if (n_recv == sizeof(err)) { + *p_is_error = err == 1; + return Success(); + } + return Fail("Failed to receive error signal."); + } +}; } // namespace xgboost::collective::proto diff --git a/src/collective/result.cc b/src/collective/result.cc new file mode 100644 index 000000000..b11710572 --- /dev/null +++ b/src/collective/result.cc @@ -0,0 +1,86 @@ +/** + * Copyright 2024, XGBoost Contributors + */ +#include "xgboost/collective/result.h" + +#include // for path +#include // for stringstream +#include // for stack + +#include "xgboost/logging.h" + +namespace xgboost::collective { +namespace detail { +[[nodiscard]] std::string ResultImpl::Report() const { + std::stringstream ss; + ss << "\n- " << this->message; + if (this->errc != std::error_code{}) { + ss << " system error:" << this->errc.message(); + } + + auto ptr = prev.get(); + while (ptr) { + ss << "\n- "; + ss << ptr->message; + + if (ptr->errc != std::error_code{}) { + ss << " " << ptr->errc.message(); + } + ptr = ptr->prev.get(); + } + + return ss.str(); +} + +[[nodiscard]] std::error_code ResultImpl::Code() const { + // Find the root error. 
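For reference, the `proto::Error` handshake added above exchanges a single fixed-width integer over a local socket, so both sides stay simple. A rough sketch of the intended flow is shown below; it assumes a connected `TCPSocket` pair and that the snippet lives next to `protocol.h` in `src/collective/`. The function names are illustrative.

```cpp
#include "protocol.h"                   // proto::Error (internal header, path assumed)
#include "xgboost/collective/result.h"  // Result, SafeColl
#include "xgboost/collective/socket.h"  // TCPSocket

namespace xgboost::collective {
// Error-handler side: block until either an error or a shutdown signal arrives.
Result WaitForSignal(TCPSocket* peer, bool* p_is_error) {
  // Reads one std::int32_t and reports which signal it was through p_is_error.
  return proto::Error{}.RecvSignal(peer, p_is_error);
}

// Worker side: either flag an error or ask the local handler to wind down.
void NotifyHandler(TCPSocket* handler, bool saw_error) {
  auto rc = saw_error ? proto::Error{}.SignalError(handler)
                      : proto::Error{}.SignalShutdown(handler);
  SafeColl(rc);
}
}  // namespace xgboost::collective
```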
+ std::stack stack; + auto ptr = this; + while (ptr) { + stack.push(ptr); + if (ptr->prev) { + ptr = ptr->prev.get(); + } else { + break; + } + } + while (!stack.empty()) { + auto frame = stack.top(); + stack.pop(); + if (frame->errc != std::error_code{}) { + return frame->errc; + } + } + return std::error_code{}; +} + +void ResultImpl::Concat(std::unique_ptr rhs) { + auto ptr = this; + while (ptr->prev) { + ptr = ptr->prev.get(); + } + ptr->prev = std::move(rhs); +} + +#if (!defined(__GNUC__) && !defined(__clang__)) || defined(__MINGW32__) +std::string MakeMsg(std::string&& msg, char const*, std::int32_t) { + return std::forward(msg); +} +#else +std::string MakeMsg(std::string&& msg, char const* file, std::int32_t line) { + auto name = std::filesystem::path{file}.filename(); + if (file && line != -1) { + return "[" + name.string() + ":" + std::to_string(line) + // NOLINT + "]: " + std::forward(msg); + } + return std::forward(msg); +} +#endif +} // namespace detail + +void SafeColl(Result const& rc) { + if (!rc.OK()) { + LOG(FATAL) << rc.Report(); + } +} +} // namespace xgboost::collective diff --git a/src/collective/socket.cc b/src/collective/socket.cc index 43da366bd..737ce584e 100644 --- a/src/collective/socket.cc +++ b/src/collective/socket.cc @@ -1,5 +1,5 @@ /** - * Copyright 2022-2023 by XGBoost Contributors + * Copyright 2022-2024, XGBoost Contributors */ #include "xgboost/collective/socket.h" @@ -8,7 +8,8 @@ #include // std::int32_t #include // std::memcpy, std::memset #include // for path -#include // std::error_code, std::system_category +#include // for error_code, system_category +#include // for sleep_for #include "rabit/internal/socket.h" // for PollHelper #include "xgboost/collective/result.h" // for Result @@ -65,14 +66,18 @@ std::size_t TCPSocket::Send(StringView str) { return bytes; } -std::size_t TCPSocket::Recv(std::string *p_str) { +[[nodiscard]] Result TCPSocket::Recv(std::string *p_str) { CHECK(!this->IsClosed()); std::int32_t len; - CHECK_EQ(this->RecvAll(&len, sizeof(len)), sizeof(len)) << "Failed to recv string length."; + if (this->RecvAll(&len, sizeof(len)) != sizeof(len)) { + return Fail("Failed to recv string length."); + } p_str->resize(len); auto bytes = this->RecvAll(&(*p_str)[0], len); - CHECK_EQ(bytes, len) << "Failed to recv string."; - return bytes; + if (static_cast(bytes) != len) { + return Fail("Failed to recv string."); + } + return Success(); } [[nodiscard]] Result Connect(xgboost::StringView host, std::int32_t port, std::int32_t retry, @@ -110,11 +115,7 @@ std::size_t TCPSocket::Recv(std::string *p_str) { for (std::int32_t attempt = 0; attempt < std::max(retry, 1); ++attempt) { if (attempt > 0) { LOG(WARNING) << "Retrying connection to " << host << " for the " << attempt << " time."; -#if defined(_MSC_VER) || defined(__MINGW32__) - Sleep(attempt << 1); -#else - sleep(attempt << 1); -#endif + std::this_thread::sleep_for(std::chrono::seconds{attempt << 1}); } auto rc = connect(conn.Handle(), addr_handle, addr_len); @@ -158,8 +159,8 @@ std::size_t TCPSocket::Recv(std::string *p_str) { std::stringstream ss; ss << "Failed to connect to " << host << ":" << port; - conn.Close(); - return Fail(ss.str(), std::move(last_error)); + auto close_rc = conn.Close(); + return Fail(ss.str(), std::move(close_rc) + std::move(last_error)); } [[nodiscard]] Result GetHostName(std::string *p_out) { diff --git a/src/collective/tracker.cc b/src/collective/tracker.cc index 88c51d8a9..142483ccf 100644 --- a/src/collective/tracker.cc +++ b/src/collective/tracker.cc @@ -1,6 
+1,7 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ +#include "rabit/internal/socket.h" #if defined(__unix__) || defined(__APPLE__) #include // gethostbyname #include // socket, AF_INET6, AF_INET, connect, getsockname @@ -27,12 +28,14 @@ #include "tracker.h" #include "xgboost/collective/result.h" // for Result, Fail, Success #include "xgboost/collective/socket.h" // for GetHostName, FailWithCode, MakeSockAddress, ... -#include "xgboost/json.h" +#include "xgboost/json.h" // for Json namespace xgboost::collective { Tracker::Tracker(Json const& config) - : n_workers_{static_cast( - RequiredArg(config, "n_workers", __func__))}, + : sortby_{static_cast( + OptionalArg(config, "sortby", static_cast(SortBy::kHost)))}, + n_workers_{ + static_cast(RequiredArg(config, "n_workers", __func__))}, port_{static_cast(OptionalArg(config, "port", Integer::Int{0}))}, timeout_{std::chrono::seconds{OptionalArg( config, "timeout", static_cast(collective::DefaultTimeoutSec()))}} {} @@ -56,20 +59,25 @@ Result Tracker::WaitUntilReady() const { return Success(); } -RabitTracker::WorkerProxy::WorkerProxy(std::int32_t world, TCPSocket sock, SockAddrV4 addr) +RabitTracker::WorkerProxy::WorkerProxy(std::int32_t world, TCPSocket sock, SockAddress addr) : sock_{std::move(sock)} { std::int32_t rank{0}; Json jcmd; std::int32_t port{0}; - rc_ = Success() << [&] { return proto::Magic{}.Verify(&sock_); } << [&] { + rc_ = Success() << [&] { + return proto::Magic{}.Verify(&sock_); + } << [&] { return proto::Connect{}.TrackerRecv(&sock_, &world_, &rank, &task_id_); } << [&] { std::string cmd; - sock_.Recv(&cmd); + auto rc = sock_.Recv(&cmd); + if (!rc.OK()) { + return rc; + } jcmd = Json::Load(StringView{cmd}); cmd_ = static_cast(get(jcmd["cmd"])); - return Success(); + return rc; } << [&] { if (cmd_ == proto::CMD::kStart) { proto::Start start; @@ -83,28 +91,37 @@ RabitTracker::WorkerProxy::WorkerProxy(std::int32_t world, TCPSocket sock, SockA } return Success(); } << [&] { - auto host = addr.Addr(); - info_ = proto::PeerInfo{host, port, rank}; + if (addr.IsV4()) { + auto host = addr.V4().Addr(); + info_ = proto::PeerInfo{host, port, rank}; + } else { + auto host = addr.V6().Addr(); + info_ = proto::PeerInfo{host, port, rank}; + } return Success(); }; } RabitTracker::RabitTracker(Json const& config) : Tracker{config} { std::string self; - auto rc = collective::GetHostAddress(&self); - auto host = OptionalArg(config, "host", self); + auto rc = Success() << [&] { + return collective::GetHostAddress(&self); + } << [&] { + host_ = OptionalArg(config, "host", self); - host_ = host; - listener_ = TCPSocket::Create(SockDomain::kV4); - rc = listener_.Bind(host, &this->port_); - CHECK(rc.OK()) << rc.Report(); - listener_.Listen(); + auto addr = MakeSockAddress(xgboost::StringView{host_}, 0); + listener_ = TCPSocket::Create(addr.IsV4() ? SockDomain::kV4 : SockDomain::kV6); + return listener_.Bind(host_, &this->port_); + } << [&] { + return listener_.Listen(); + }; + SafeColl(rc); } Result RabitTracker::Bootstrap(std::vector* p_workers) { auto& workers = *p_workers; - std::sort(workers.begin(), workers.end(), WorkerCmp{}); + std::sort(workers.begin(), workers.end(), WorkerCmp{this->sortby_}); std::vector bootstrap_threads; for (std::int32_t r = 0; r < n_workers_; ++r) { @@ -211,9 +228,13 @@ Result RabitTracker::Bootstrap(std::vector* p_workers) { // // retry is set to 1, just let the worker timeout or error. Otherwise the // tracker and the worker might be waiting for each other. 
- auto rc = Connect(w.first, w.second, 1, timeout_, &out); + auto rc = Success() << [&] { + return Connect(w.first, w.second, 1, timeout_, &out); + } << [&] { + return proto::Error{}.SignalError(&out); + }; if (!rc.OK()) { - return Fail("Failed to inform workers to stop."); + return Fail("Failed to inform worker:" + w.first + " for error.", std::move(rc)); } } return Success(); @@ -222,13 +243,37 @@ Result RabitTracker::Bootstrap(std::vector* p_workers) { return std::async(std::launch::async, [this, handle_error] { State state{this->n_workers_}; + auto select_accept = [&](TCPSocket* sock, auto* addr) { + // accept with poll so that we can enable timeout and interruption. + rabit::utils::PollHelper poll; + auto rc = Success() << [&] { + std::lock_guard lock{listener_mu_}; + return listener_.NonBlocking(true); + } << [&] { + std::lock_guard lock{listener_mu_}; + poll.WatchRead(listener_); + if (state.running) { + // Don't timeout if the communicator group is up and running. + return poll.Poll(std::chrono::seconds{-1}); + } else { + // Have timeout for workers to bootstrap. + return poll.Poll(timeout_); + } + } << [&] { + // this->Stop() closes the socket with a lock. Therefore, when the accept returns + // due to shutdown, the state is still valid (closed). + return listener_.Accept(sock, addr); + }; + return rc; + }; + while (state.ShouldContinue()) { TCPSocket sock; - SockAddrV4 addr; + SockAddress addr; this->ready_ = true; - auto rc = listener_.Accept(&sock, &addr); + auto rc = select_accept(&sock, &addr); if (!rc.OK()) { - return Fail("Failed to accept connection.", std::move(rc)); + return Fail("Failed to accept connection.", this->Stop() + std::move(rc)); } auto worker = WorkerProxy{n_workers_, std::move(sock), std::move(addr)}; @@ -243,7 +288,7 @@ Result RabitTracker::Bootstrap(std::vector* p_workers) { state.Error(); rc = handle_error(worker); if (!rc.OK()) { - return Fail("Failed to handle abort.", std::move(rc)); + return Fail("Failed to handle abort.", this->Stop() + std::move(rc)); } } @@ -253,7 +298,7 @@ Result RabitTracker::Bootstrap(std::vector* p_workers) { state.Bootstrap(); } if (!rc.OK()) { - return rc; + return this->Stop() + std::move(rc); } continue; } @@ -280,25 +325,43 @@ Result RabitTracker::Bootstrap(std::vector* p_workers) { } case proto::CMD::kInvalid: default: { - return Fail("Invalid command received."); + return Fail("Invalid command received.", this->Stop()); } } } - ready_ = false; - return Success(); + return this->Stop(); }); } [[nodiscard]] Json RabitTracker::WorkerArgs() const { auto rc = this->WaitUntilReady(); - CHECK(rc.OK()) << rc.Report(); + SafeColl(rc); Json args{Object{}}; - args["DMLC_TRACKER_URI"] = String{host_}; - args["DMLC_TRACKER_PORT"] = this->Port(); + args["dmlc_tracker_uri"] = String{host_}; + args["dmlc_tracker_port"] = this->Port(); return args; } +[[nodiscard]] Result RabitTracker::Stop() { + if (!this->Ready()) { + return Success(); + } + + ready_ = false; + std::lock_guard lock{listener_mu_}; + if (this->listener_.IsClosed()) { + return Success(); + } + + return Success() << [&] { + // This should have the effect of stopping the `accept` call. 
+ return this->listener_.Shutdown(); + } << [&] { + return listener_.Close(); + }; +} + [[nodiscard]] Result GetHostAddress(std::string* out) { auto rc = GetHostName(out); if (!rc.OK()) { diff --git a/src/collective/tracker.h b/src/collective/tracker.h index f336a82f9..af30e0be7 100644 --- a/src/collective/tracker.h +++ b/src/collective/tracker.h @@ -1,5 +1,5 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #pragma once #include // for seconds @@ -36,6 +36,19 @@ namespace xgboost::collective { * signal an error to the tracker and the tracker will notify other workers. */ class Tracker { + public: + enum class SortBy : std::int8_t { + kHost = 0, + kTask = 1, + }; + + protected: + // How to sort the workers, either by host name or by task ID. When using a multi-GPU + // setting, multiple workers can occupy the same host, in which case one should sort + // workers by task. Due to compatibility reason, the task ID is not always available, so + // we use host as the default. + SortBy sortby_; + protected: std::int32_t n_workers_{0}; std::int32_t port_{-1}; @@ -44,10 +57,7 @@ class Tracker { public: explicit Tracker(Json const& config); - Tracker(std::int32_t n_worders, std::int32_t port, std::chrono::seconds timeout) - : n_workers_{n_worders}, port_{port}, timeout_{timeout} {} - - virtual ~Tracker() noexcept(false){}; // NOLINT + virtual ~Tracker() = default; [[nodiscard]] Result WaitUntilReady() const; @@ -59,6 +69,11 @@ class Tracker { * @brief Flag to indicate whether the server is running. */ [[nodiscard]] bool Ready() const { return ready_; } + /** + * @brief Shutdown the tracker, cannot be restarted again. Useful when the tracker hangs while + * calling accept. + */ + virtual Result Stop() { return Success(); } }; class RabitTracker : public Tracker { @@ -76,7 +91,7 @@ class RabitTracker : public Tracker { Result rc_; public: - explicit WorkerProxy(std::int32_t world, TCPSocket sock, SockAddrV4 addr); + explicit WorkerProxy(std::int32_t world, TCPSocket sock, SockAddress addr); WorkerProxy(WorkerProxy const& that) = delete; WorkerProxy(WorkerProxy&& that) = default; WorkerProxy& operator=(WorkerProxy const&) = delete; @@ -96,11 +111,14 @@ class RabitTracker : public Tracker { void Send(StringView value) { this->sock_.Send(value); } }; - // provide an ordering for workers, this helps us get deterministic topology. + // Provide an ordering for workers, this helps us get deterministic topology. struct WorkerCmp { + SortBy sortby; + explicit WorkerCmp(SortBy sortby) : sortby{sortby} {} + [[nodiscard]] bool operator()(WorkerProxy const& lhs, WorkerProxy const& rhs) { - auto const& lh = lhs.Host(); - auto const& rh = rhs.Host(); + auto const& lh = sortby == Tracker::SortBy::kHost ? lhs.Host() : lhs.TaskID(); + auto const& rh = sortby == Tracker::SortBy::kHost ? rhs.Host() : rhs.TaskID(); if (lh != rh) { return lh < rh; @@ -114,28 +132,22 @@ class RabitTracker : public Tracker { // record for how to reach out to workers if error happens. std::vector> worker_error_handles_; // listening socket for incoming workers. - // - // At the moment, the listener calls accept without first polling. We can add an - // additional unix domain socket to allow cancelling the accept. TCPSocket listener_; + // mutex for protecting the listener, used to prevent race when it's listening while + // another thread tries to shut it down. 
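Putting the tracker changes together: configuration is still a JSON object, the new optional `sortby` field selects the worker ordering (0 = by host, the default; 1 = by task ID, useful when several workers share one host), and `Stop()` is now the supported way to break out of a hanging `accept`. A hedged sketch of driving the tracker with the keys shown in this patch follows; the helper name and the literal values are illustrative.

```cpp
#include <future>  // for future

#include "tracker.h"                    // RabitTracker (internal header, path assumed)
#include "xgboost/collective/result.h"  // Result, SafeColl
#include "xgboost/json.h"               // Json, Object, Integer

namespace xgboost::collective {
void RunTracker() {
  Json config{Object{}};
  config["n_workers"] = Integer{2};  // required
  config["port"] = Integer{0};       // optional, 0 lets the OS pick a port
  config["timeout"] = Integer{30};   // optional, seconds
  config["sortby"] = Integer{0};     // optional, 0 = host (default), 1 = task ID

  RabitTracker tracker{config};
  std::future<Result> fut = tracker.Run();

  // Workers read the connection info from here; the keys are now lower-case.
  auto args = tracker.WorkerArgs();  // dmlc_tracker_uri / dmlc_tracker_port

  // ... launch workers with `args`; they connect back to the tracker ...

  SafeColl(fut.get());       // resolves once all workers have shut down
  SafeColl(tracker.Stop());  // idempotent; also usable to interrupt a stuck accept
}
}  // namespace xgboost::collective
```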
+ std::mutex listener_mu_; Result Bootstrap(std::vector* p_workers); public: - explicit RabitTracker(StringView host, std::int32_t n_worders, std::int32_t port, - std::chrono::seconds timeout) - : Tracker{n_worders, port, timeout}, host_{host.c_str(), host.size()} { - listener_ = TCPSocket::Create(SockDomain::kV4); - auto rc = listener_.Bind(host, &this->port_); - CHECK(rc.OK()) << rc.Report(); - listener_.Listen(); - } - explicit RabitTracker(Json const& config); - ~RabitTracker() noexcept(false) override = default; + ~RabitTracker() override = default; std::future Run() override; [[nodiscard]] Json WorkerArgs() const override; + // Stop the tracker without waiting. This is to prevent the tracker from hanging when + // one of the workers failes to start. + [[nodiscard]] Result Stop() override; }; // Prob the public IP address of the host, need a better method. diff --git a/src/common/column_matrix.h b/src/common/column_matrix.h index fa71c4f52..77ac18364 100644 --- a/src/common/column_matrix.h +++ b/src/common/column_matrix.h @@ -72,7 +72,7 @@ class SparseColumnIter : public Column { public: SparseColumnIter(common::Span index, bst_bin_t least_bin_idx, - common::Span row_ind, bst_row_t first_row_idx) + common::Span row_ind, bst_idx_t first_row_idx) : Base{index, least_bin_idx}, row_ind_(row_ind) { // first_row_id is the first row in the leaf partition const size_t* row_data = RowIndices(); @@ -301,7 +301,7 @@ class ColumnMatrix { } template - auto SparseColumn(bst_feature_t fidx, bst_row_t first_row_idx) const { + auto SparseColumn(bst_feature_t fidx, bst_idx_t first_row_idx) const { const size_t feature_offset = feature_offsets_[fidx]; // to get right place for certain feature const size_t column_size = feature_offsets_[fidx + 1] - feature_offset; common::Span bin_index = { @@ -325,7 +325,7 @@ class ColumnMatrix { // all columns are dense column and has no missing value // FIXME(jiamingy): We don't need a column matrix if there's no missing value. 
template - void SetIndexNoMissing(bst_row_t base_rowid, RowBinIdxT const* row_index, const size_t n_samples, + void SetIndexNoMissing(bst_idx_t base_rowid, RowBinIdxT const* row_index, const size_t n_samples, const size_t n_features, int32_t n_threads) { missing_.GrowTo(feature_offsets_[n_features], false); diff --git a/src/common/common.hip b/src/common/common.hip deleted file mode 100644 index c665b11bc..000000000 --- a/src/common/common.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "common.cu" -#endif diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 0f5a37210..1d0cddf09 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -14,18 +14,15 @@ #include // make_transform_output_iterator #include #include -#include #include #include #include #include #include -#include #include // for size_t #include #include -#include #include #include #include @@ -33,7 +30,6 @@ #include "../collective/communicator-inl.h" #include "common.h" -#include "xgboost/global_config.h" #include "xgboost/host_device_vector.h" #include "xgboost/logging.h" #include "xgboost/span.h" @@ -304,21 +300,22 @@ class MemoryLogger { void RegisterAllocation(void *ptr, size_t n) { device_allocations[ptr] = n; currently_allocated_bytes += n; - peak_allocated_bytes = - std::max(peak_allocated_bytes, currently_allocated_bytes); + peak_allocated_bytes = std::max(peak_allocated_bytes, currently_allocated_bytes); num_allocations++; CHECK_GT(num_allocations, num_deallocations); } void RegisterDeallocation(void *ptr, size_t n, int current_device) { auto itr = device_allocations.find(ptr); if (itr == device_allocations.end()) { - LOG(WARNING) << "Attempting to deallocate " << n << " bytes on device " - << current_device << " that was never allocated "; + LOG(WARNING) << "Attempting to deallocate " << n << " bytes on device " << current_device + << " that was never allocated\n" + << dmlc::StackTrace(); + } else { + num_deallocations++; + CHECK_LE(num_deallocations, num_allocations); + currently_allocated_bytes -= itr->second; + device_allocations.erase(itr); } - num_deallocations++; - CHECK_LE(num_deallocations, num_allocations); - currently_allocated_bytes -= itr->second; - device_allocations.erase(itr); } }; DeviceStats stats_; diff --git a/src/common/error_msg.cc b/src/common/error_msg.cc index 8871c1a1d..cdbe5ebf6 100644 --- a/src/common/error_msg.cc +++ b/src/common/error_msg.cc @@ -11,7 +11,7 @@ #include "xgboost/logging.h" namespace xgboost::error { -std::string DeprecatedFunc(StringView old, StringView since, StringView replacement) { +[[nodiscard]] std::string DeprecatedFunc(StringView old, StringView since, StringView replacement) { std::stringstream ss; ss << "`" << old << "` is deprecated since" << since << ", use `" << replacement << "` instead."; return ss.str(); diff --git a/src/common/error_msg.h b/src/common/error_msg.h index 7264c3532..67114320b 100644 --- a/src/common/error_msg.h +++ b/src/common/error_msg.h @@ -89,7 +89,7 @@ void WarnDeprecatedGPUId(); void WarnEmptyDataset(); -std::string DeprecatedFunc(StringView old, StringView since, StringView replacement); +[[nodiscard]] std::string DeprecatedFunc(StringView old, StringView since, StringView replacement); constexpr StringView InvalidCUDAOrdinal() { return "Invalid device. 
`device` is required to be CUDA and there must be at least one GPU " diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index f10124792..9b703a3fa 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -34,7 +34,7 @@ HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins HistogramCuts out; auto const &info = m->Info(); auto n_threads = ctx->Threads(); - std::vector reduced(info.num_col_, 0); + std::vector reduced(info.num_col_, 0); for (auto const &page : m->GetBatches()) { auto const &entries_per_column = CalcColumnSize(data::SparsePageAdapterBatch{page.GetView()}, info.num_col_, n_threads, @@ -209,10 +209,10 @@ void RowsWiseBuildHistKernel(Span gpair, CHECK(offsets); } - auto get_row_ptr = [&](bst_row_t ridx) { + auto get_row_ptr = [&](bst_idx_t ridx) { return kFirstPage ? row_ptr[ridx] : row_ptr[ridx - base_rowid]; }; - auto get_rid = [&](bst_row_t ridx) { return kFirstPage ? ridx : (ridx - base_rowid); }; + auto get_rid = [&](bst_idx_t ridx) { return kFirstPage ? ridx : (ridx - base_rowid); }; const size_t n_features = get_row_ptr(row_indices.begin[0] + 1) - get_row_ptr(row_indices.begin[0]); @@ -275,10 +275,10 @@ void ColsWiseBuildHistKernel(Span gpair, auto const &row_ptr = gmat.row_ptr.data(); auto base_rowid = gmat.base_rowid; const uint32_t *offsets = gmat.index.Offset(); - auto get_row_ptr = [&](bst_row_t ridx) { + auto get_row_ptr = [&](bst_idx_t ridx) { return kFirstPage ? row_ptr[ridx] : row_ptr[ridx - base_rowid]; }; - auto get_rid = [&](bst_row_t ridx) { return kFirstPage ? ridx : (ridx - base_rowid); }; + auto get_rid = [&](bst_idx_t ridx) { return kFirstPage ? ridx : (ridx - base_rowid); }; const size_t n_features = gmat.cut.Ptrs().size() - 1; const size_t n_columns = n_features; diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu index fbe6356bf..39f310ebb 100644 --- a/src/common/hist_util.cu +++ b/src/common/hist_util.cu @@ -13,8 +13,6 @@ #include #include // for size_t -#include -#include #include #include @@ -39,7 +37,7 @@ size_t RequiredSampleCutsPerColumn(int max_bins, size_t num_rows) { return std::min(num_cuts, num_rows); } -size_t RequiredSampleCuts(bst_row_t num_rows, bst_feature_t num_columns, +size_t RequiredSampleCuts(bst_idx_t num_rows, bst_feature_t num_columns, size_t max_bins, size_t nnz) { auto per_column = RequiredSampleCutsPerColumn(max_bins, num_rows); auto if_dense = num_columns * per_column; @@ -47,7 +45,7 @@ size_t RequiredSampleCuts(bst_row_t num_rows, bst_feature_t num_columns, return result; } -size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz, +size_t RequiredMemory(bst_idx_t num_rows, bst_feature_t num_columns, size_t nnz, size_t num_bins, bool with_weights) { size_t peak = 0; // 0. 
Allocate cut pointer in quantile container by increasing: n_columns + 1 @@ -85,7 +83,7 @@ size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz, return peak; } -size_t SketchBatchNumElements(size_t sketch_batch_num_elements, bst_row_t num_rows, +size_t SketchBatchNumElements(size_t sketch_batch_num_elements, bst_idx_t num_rows, bst_feature_t columns, size_t nnz, int device, size_t num_cuts, bool has_weight) { auto constexpr kIntMax = static_cast(std::numeric_limits::max()); @@ -123,7 +121,7 @@ void SortByWeight(dh::device_vector* weights, dh::device_vector* s [=] __device__(const Entry& a, const Entry& b) { return a.index == b.index; }); } -void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span d_cuts_ptr, +void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span d_cuts_ptr, dh::device_vector* p_sorted_entries, dh::device_vector* p_sorted_weights, dh::caching_device_vector* p_column_sizes_scan) { @@ -210,7 +208,7 @@ void ProcessWeightedBatch(Context const* ctx, const SparsePage& page, MetaInfo c sorted_entries = dh::device_vector(h_data.begin() + begin, h_data.begin() + end); } - bst_row_t base_rowid = page.base_rowid; + bst_idx_t base_rowid = page.base_rowid; dh::device_vector entry_weight; auto cuctx = ctx->CUDACtx(); diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh index 37751b40b..1ee8ef7f5 100644 --- a/src/common/hist_util.cuh +++ b/src/common/hist_util.cuh @@ -8,6 +8,7 @@ #define COMMON_HIST_UTIL_CUH_ #include +#include // for sort #include // for size_t @@ -187,7 +188,7 @@ inline size_t constexpr BytesPerElement(bool has_weight) { * directly if it's not 0. */ size_t SketchBatchNumElements(size_t sketch_batch_num_elements, - bst_row_t num_rows, bst_feature_t columns, + bst_idx_t num_rows, bst_feature_t columns, size_t nnz, int device, size_t num_cuts, bool has_weight); @@ -210,7 +211,7 @@ size_t RequiredSampleCutsPerColumn(int max_bins, size_t num_rows); * * \return The estimated bytes */ -size_t RequiredMemory(bst_row_t num_rows, bst_feature_t num_columns, size_t nnz, +size_t RequiredMemory(bst_idx_t num_rows, bst_feature_t num_columns, size_t nnz, size_t num_bins, bool with_weights); // Count the valid entries in each column and copy them out. 
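Most of the remaining churn in these files is the mechanical `bst_row_t` to `bst_idx_t` rename. Assuming the aliases in `xgboost/base.h` (where `bst_row_t` aliased the platform-dependent `std::size_t` and `bst_idx_t` is a fixed-width 64-bit unsigned type), the point is that row counts, row indices and element offsets now share one type across builds. The tiny sketch below is illustrative only; the helper is not part of the patch.

```cpp
#include "xgboost/base.h"  // bst_idx_t, bst_feature_t

namespace xgboost {
// One index type end to end: row stride, row index and the resulting element
// offset are all bst_idx_t, so the arithmetic is identical on every platform.
inline bst_idx_t ElementOffset(bst_idx_t row_stride, bst_idx_t ridx, bst_feature_t fidx) {
  return row_stride * ridx + fidx;
}
}  // namespace xgboost
```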
@@ -241,7 +242,7 @@ void MakeEntriesFromAdapter(AdapterBatch const& batch, BatchIter batch_iter, Ran void SortByWeight(dh::device_vector* weights, dh::device_vector* sorted_entries); -void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span d_cuts_ptr, +void RemoveDuplicatedCategories(DeviceOrd device, MetaInfo const& info, Span d_cuts_ptr, dh::device_vector* p_sorted_entries, dh::device_vector* p_sorted_weights, dh::caching_device_vector* p_column_sizes_scan); diff --git a/src/common/hist_util.hip b/src/common/hist_util.hip deleted file mode 100644 index 86eb989b3..000000000 --- a/src/common/hist_util.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "hist_util.cu" -#endif diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc index f755426af..7956980d8 100644 --- a/src/common/host_device_vector.cc +++ b/src/common/host_device_vector.cc @@ -178,7 +178,7 @@ template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; -template class HostDeviceVector; // bst_row_t +template class HostDeviceVector; template class HostDeviceVector; // bst_feature_t #if defined(__APPLE__) || defined(__EMSCRIPTEN__) diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu index d9ae38ced..a23d5fbbb 100644 --- a/src/common/host_device_vector.cu +++ b/src/common/host_device_vector.cu @@ -6,7 +6,6 @@ #include #include -#include #include "xgboost/data.h" #include "xgboost/host_device_vector.h" @@ -416,7 +415,7 @@ template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; -template class HostDeviceVector; // bst_row_t +template class HostDeviceVector; template class HostDeviceVector; // bst_feature_t template class HostDeviceVector; template class HostDeviceVector; diff --git a/src/common/host_device_vector.hip b/src/common/host_device_vector.hip deleted file mode 100644 index beae69382..000000000 --- a/src/common/host_device_vector.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "host_device_vector.cu" -#endif diff --git a/src/common/json.cc b/src/common/json.cc index de9a89f78..2887eeccf 100644 --- a/src/common/json.cc +++ b/src/common/json.cc @@ -1,11 +1,12 @@ /** - * Copyright 2019-2023, XGBoost Contributors + * Copyright 2019-2024, XGBoost Contributors */ #include "xgboost/json.h" #include // for array #include // for isdigit #include // for isinf, isnan +#include // for uint8_t, uint16_t, uint32_t #include // for EOF #include // for size_t, strtof #include // for memcpy @@ -72,15 +73,16 @@ void JsonWriter::Visit(JsonNumber const* num) { } void JsonWriter::Visit(JsonInteger const* num) { - char i2s_buffer_[NumericLimits::kToCharsSize]; + std::array::kToCharsSize> i2s_buffer_; auto i = num->GetInteger(); - auto ret = to_chars(i2s_buffer_, i2s_buffer_ + NumericLimits::kToCharsSize, i); + auto ret = + to_chars(i2s_buffer_.data(), i2s_buffer_.data() + NumericLimits::kToCharsSize, i); auto end = ret.ptr; CHECK(ret.ec == std::errc()); - auto digits = std::distance(i2s_buffer_, end); + auto digits = std::distance(i2s_buffer_.data(), end); auto ori_size = stream_->size(); stream_->resize(ori_size + digits); - std::memcpy(stream_->data() + ori_size, i2s_buffer_, digits); + std::memcpy(stream_->data() + ori_size, i2s_buffer_.data(), digits); } void JsonWriter::Visit(JsonNull const* ) { @@ -143,8 +145,10 @@ std::string Value::TypeStr() const { return "Null"; 
case ValueKind::kInteger: return "Integer"; - case ValueKind::kNumberArray: + case ValueKind::kF32Array: return "F32Array"; + case ValueKind::kF64Array: + return "F64Array"; case ValueKind::kU8Array: return "U8Array"; case ValueKind::kI32Array: @@ -262,10 +266,11 @@ bool JsonTypedArray::operator==(Value const& rhs) const { return std::equal(arr.cbegin(), arr.cend(), vec_.cbegin()); } -template class JsonTypedArray; -template class JsonTypedArray; -template class JsonTypedArray; -template class JsonTypedArray; +template class JsonTypedArray; +template class JsonTypedArray; +template class JsonTypedArray; +template class JsonTypedArray; +template class JsonTypedArray; // Json Number bool JsonNumber::operator==(Value const& rhs) const { @@ -708,6 +713,8 @@ Json UBJReader::ParseArray() { switch (type) { case 'd': return ParseTypedArray(n); + case 'D': + return ParseTypedArray(n); case 'U': return ParseTypedArray(n); case 'l': @@ -791,12 +798,16 @@ Json UBJReader::Parse() { return Json{JsonBoolean{true}}; } case 'F': { - return Json{JsonBoolean{true}}; + return Json{JsonBoolean{false}}; } case 'd': { auto v = this->ReadPrimitive(); return Json{v}; } + case 'D': { + auto v = this->ReadPrimitive(); + return Json{v}; + } case 'S': { auto str = this->DecodeStr(); return Json{str}; @@ -825,10 +836,6 @@ Json UBJReader::Parse() { Integer::Int i = this->ReadPrimitive(); return Json{i}; } - case 'D': { - LOG(FATAL) << "f64 is not supported."; - break; - } case 'H': { LOG(FATAL) << "High precision number is not supported."; break; @@ -882,6 +889,8 @@ void WriteTypedArray(JsonTypedArray const* arr, std::vector* stre stream->push_back('$'); if (std::is_same::value) { stream->push_back('d'); + } else if (std::is_same_v) { + stream->push_back('D'); } else if (std::is_same::value) { stream->push_back('i'); } else if (std::is_same::value) { @@ -910,6 +919,7 @@ void WriteTypedArray(JsonTypedArray const* arr, std::vector* stre } void UBJWriter::Visit(F32Array const* arr) { WriteTypedArray(arr, stream_); } +void UBJWriter::Visit(F64Array const* arr) { WriteTypedArray(arr, stream_); } void UBJWriter::Visit(U8Array const* arr) { WriteTypedArray(arr, stream_); } void UBJWriter::Visit(I32Array const* arr) { WriteTypedArray(arr, stream_); } void UBJWriter::Visit(I64Array const* arr) { WriteTypedArray(arr, stream_); } diff --git a/src/common/linalg_op.cuh b/src/common/linalg_op.cuh index 74cee3757..21fad2dc0 100644 --- a/src/common/linalg_op.cuh +++ b/src/common/linalg_op.cuh @@ -13,15 +13,14 @@ #include "xgboost/context.h" // for Context #include "xgboost/linalg.h" // for TensorView -namespace xgboost { -namespace linalg { +namespace xgboost::linalg { namespace cuda_impl { // Use template specialization to dispatch, Windows + CUDA 11.8 doesn't support extended // lambda inside constexpr if template struct ElementWiseImpl { template - void operator()(linalg::TensorView t, Fn&& fn, cudaStream_t s) { + void operator()(TensorView t, Fn&& fn, cudaStream_t s) { static_assert(D > 1); dh::LaunchN(t.Size(), s, [=] __device__(std::size_t i) mutable { std::apply(fn, linalg::UnravelIndex(i, t.Shape())); @@ -32,37 +31,59 @@ struct ElementWiseImpl { template struct ElementWiseImpl { template - void operator()(linalg::TensorView t, Fn&& fn, cudaStream_t s) { + void operator()(TensorView t, Fn&& fn, cudaStream_t s) { dh::LaunchN(t.Size(), s, [=] __device__(std::size_t i) { fn(i); }); } }; template -void ElementWiseKernel(linalg::TensorView t, Fn&& fn, cudaStream_t s = nullptr) { +void ElementWiseKernel(TensorView t, Fn&& fn, 
cudaStream_t s = nullptr) { dh::safe_cuda(cudaSetDevice(t.Device().ordinal)); cuda_impl::ElementWiseImpl{}(t, fn, s); } } // namespace cuda_impl template -void ElementWiseTransformDevice(linalg::TensorView t, Fn&& fn, cudaStream_t s = nullptr) -{ +void ElementWiseTransformDevice(TensorView t, Fn&& fn, cudaStream_t s = nullptr) { if (t.Contiguous()) { auto ptr = t.Values().data(); dh::LaunchN(t.Size(), s, [=] __device__(size_t i) { ptr[i] = fn(i, ptr[i]); }); } else { dh::LaunchN(t.Size(), s, [=] __device__(size_t i) mutable { - T& v = detail::Apply(t, linalg::UnravelIndex(i, t.Shape())); + T& v = detail::Apply(t, UnravelIndex(i, t.Shape())); v = fn(i, v); }); } } template -void ElementWiseKernel(Context const* ctx, linalg::TensorView t, Fn&& fn) { +void ElementWiseKernel(Context const* ctx, TensorView t, Fn&& fn) { ctx->IsCUDA() ? cuda_impl::ElementWiseKernel(t, fn) : ElementWiseKernelHost(t, ctx->Threads(), fn); } -} // namespace linalg -} // namespace xgboost + +namespace detail { +template +struct IterOp { + TensorView v; + XGBOOST_DEVICE T& operator()(std::size_t i) { + return detail::Apply(v, UnravelIndex(i, v.Shape())); + } +}; +} // namespace detail + +// naming: thrust begin +// returns a thrust iterator for a tensor view. +template +auto tcbegin(TensorView v) { // NOLINT + return dh::MakeTransformIterator( + thrust::make_counting_iterator(0ul), + detail::IterOp>, kDim>{v}); +} + +template +auto tcend(TensorView v) { // NOLINT + return tcbegin(v) + v.Size(); +} +} // namespace xgboost::linalg #endif // XGBOOST_COMMON_LINALG_OP_CUH_ diff --git a/src/common/numeric.hip b/src/common/numeric.hip deleted file mode 100644 index 19c125901..000000000 --- a/src/common/numeric.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "numeric.cu" -#endif diff --git a/src/common/quantile.cc b/src/common/quantile.cc index c74db99e4..8c743d940 100644 --- a/src/common/quantile.cc +++ b/src/common/quantile.cc @@ -1,9 +1,10 @@ -/*! - * Copyright 2020-2022 by XGBoost Contributors +/** + * Copyright 2020-2024, XGBoost Contributors */ #include "quantile.h" #include +#include // for partial_sum #include #include "../collective/aggregator.h" @@ -14,7 +15,7 @@ namespace xgboost::common { template SketchContainerImpl::SketchContainerImpl(Context const *ctx, - std::vector columns_size, + std::vector columns_size, int32_t max_bins, Span feature_types, bool use_group) @@ -120,8 +121,8 @@ namespace { template struct QuantileAllreduce { common::Span global_values; - common::Span worker_indptr; - common::Span feature_indptr; + common::Span worker_indptr; + common::Span feature_indptr; size_t n_features{0}; /** * \brief Get sketch values of the a feature from a worker. @@ -145,9 +146,9 @@ struct QuantileAllreduce { template void SketchContainerImpl::GatherSketchInfo( - Context const *, MetaInfo const &info, + Context const *ctx, MetaInfo const &info, std::vector const &reduced, - std::vector *p_worker_segments, std::vector *p_sketches_scan, + std::vector *p_worker_segments, std::vector *p_sketches_scan, std::vector *p_global_sketches) { auto &worker_segments = *p_worker_segments; worker_segments.resize(1, 0); @@ -156,7 +157,7 @@ void SketchContainerImpl::GatherSketchInfo( auto n_columns = sketches_.size(); // get the size of each feature. 
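The `tcbegin`/`tcend` helpers added to `linalg_op.cuh` above wrap a `TensorView` in a counting-plus-transform iterator so a view can be handed directly to Thrust algorithms, including non-contiguous views, since the flat index is unravelled against the view's shape. A small usage sketch under the assumption of a CUDA build; the reduction itself is illustrative, not code from the patch.

```cpp
#include <thrust/execution_policy.h>  // thrust::device
#include <thrust/reduce.h>            // thrust::reduce

#include "linalg_op.cuh"     // linalg::tcbegin, linalg::tcend (internal header, path assumed)
#include "xgboost/linalg.h"  // linalg::TensorView

namespace xgboost {
// Sum every entry of a 2-d device tensor view, contiguous or strided.
double SumView(linalg::TensorView<float const, 2> view) {
  return thrust::reduce(thrust::device, linalg::tcbegin(view), linalg::tcend(view), 0.0);
}
}  // namespace xgboost
```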
- std::vector sketch_size; + std::vector sketch_size; for (size_t i = 0; i < reduced.size(); ++i) { if (IsCat(feature_types_, i)) { sketch_size.push_back(0); @@ -165,13 +166,15 @@ void SketchContainerImpl::GatherSketchInfo( } } // turn the size into CSC indptr - std::vector &sketches_scan = *p_sketches_scan; + std::vector &sketches_scan = *p_sketches_scan; sketches_scan.resize((n_columns + 1) * world, 0); size_t beg_scan = rank * (n_columns + 1); // starting storage for current worker. std::partial_sum(sketch_size.cbegin(), sketch_size.cend(), sketches_scan.begin() + beg_scan + 1); // Gather all column pointers - collective::GlobalSum(info, sketches_scan.data(), sketches_scan.size()); + auto rc = + collective::GlobalSum(ctx, info, linalg::MakeVec(sketches_scan.data(), sketches_scan.size())); + collective::SafeColl(rc); for (int32_t i = 0; i < world; ++i) { size_t back = (i + 1) * (n_columns + 1) - 1; auto n_entries = sketches_scan.at(back); @@ -199,14 +202,15 @@ void SketchContainerImpl::GatherSketchInfo( static_assert(sizeof(typename WQSketch::Entry) / 4 == sizeof(float), "Unexpected size of sketch entry."); - collective::GlobalSum( - info, - reinterpret_cast(global_sketches.data()), - global_sketches.size() * sizeof(typename WQSketch::Entry) / sizeof(float)); + rc = collective::GlobalSum( + ctx, info, + linalg::MakeVec(reinterpret_cast(global_sketches.data()), + global_sketches.size() * sizeof(typename WQSketch::Entry) / sizeof(float))); + collective::SafeColl(rc); } template -void SketchContainerImpl::AllreduceCategories(Context const*, MetaInfo const& info) { +void SketchContainerImpl::AllreduceCategories(Context const* ctx, MetaInfo const& info) { auto world_size = collective::GetWorldSize(); auto rank = collective::GetRank(); if (world_size == 1 || info.IsColumnSplit()) { @@ -223,10 +227,11 @@ void SketchContainerImpl::AllreduceCategories(Context const*, MetaInfo CHECK_EQ(feature_ptr.front(), 0); // gather all feature ptrs from workers - std::vector global_feat_ptrs(feature_ptr.size() * world_size, 0); + std::vector global_feat_ptrs(feature_ptr.size() * world_size, 0); size_t feat_begin = rank * feature_ptr.size(); // pointer to current worker std::copy(feature_ptr.begin(), feature_ptr.end(), global_feat_ptrs.begin() + feat_begin); - collective::GlobalSum(info, global_feat_ptrs.data(), global_feat_ptrs.size()); + auto rc = collective::GlobalSum( + ctx, info, linalg::MakeVec(global_feat_ptrs.data(), global_feat_ptrs.size())); // move all categories into a flatten vector to prepare for allreduce size_t total = feature_ptr.back(); @@ -237,9 +242,10 @@ void SketchContainerImpl::AllreduceCategories(Context const*, MetaInfo } // indptr for indexing workers - std::vector global_worker_ptr(world_size + 1, 0); + std::vector global_worker_ptr(world_size + 1, 0); global_worker_ptr[rank + 1] = total; // shift 1 to right for constructing the indptr - collective::GlobalSum(info, global_worker_ptr.data(), global_worker_ptr.size()); + rc = collective::GlobalSum(ctx, info, + linalg::MakeVec(global_worker_ptr.data(), global_worker_ptr.size())); std::partial_sum(global_worker_ptr.cbegin(), global_worker_ptr.cend(), global_worker_ptr.begin()); // total number of categories in all workers with all features auto gtotal = global_worker_ptr.back(); @@ -251,7 +257,8 @@ void SketchContainerImpl::AllreduceCategories(Context const*, MetaInfo CHECK_EQ(rank_size, total); std::copy(flatten.cbegin(), flatten.cend(), global_categories.begin() + rank_begin); // gather values from all workers. 
- collective::GlobalSum(info, global_categories.data(), global_categories.size()); + rc = collective::GlobalSum(ctx, info, + linalg::MakeVec(global_categories.data(), global_categories.size())); QuantileAllreduce allreduce_result{global_categories, global_worker_ptr, global_feat_ptrs, categories_.size()}; ParallelFor(categories_.size(), n_threads_, [&](auto fidx) { @@ -292,12 +299,14 @@ void SketchContainerImpl::AllReduce( reduced.resize(sketches_.size()); // Prune the intermediate num cuts for synchronization. - std::vector global_column_size(columns_size_); - collective::GlobalSum(info, &global_column_size); + std::vector global_column_size(columns_size_); + auto rc = collective::GlobalSum( + ctx, info, linalg::MakeVec(global_column_size.data(), global_column_size.size())); + collective::SafeColl(rc); ParallelFor(sketches_.size(), n_threads_, [&](size_t i) { int32_t intermediate_num_cuts = static_cast( - std::min(global_column_size[i], static_cast(max_bins_ * WQSketch::kFactor))); + std::min(global_column_size[i], static_cast(max_bins_ * WQSketch::kFactor))); if (global_column_size[i] == 0) { return; } @@ -319,8 +328,8 @@ void SketchContainerImpl::AllReduce( return; } - std::vector worker_segments(1, 0); // CSC pointer to sketches. - std::vector sketches_scan((n_columns + 1) * world, 0); + std::vector worker_segments(1, 0); // CSC pointer to sketches. + std::vector sketches_scan((n_columns + 1) * world, 0); std::vector global_sketches; this->GatherSketchInfo(ctx, info, reduced, &worker_segments, &sketches_scan, &global_sketches); @@ -444,11 +453,11 @@ template class SketchContainerImpl>; HostSketchContainer::HostSketchContainer(Context const *ctx, bst_bin_t max_bins, common::Span ft, - std::vector columns_size, bool use_group) + std::vector columns_size, bool use_group) : SketchContainerImpl{ctx, columns_size, max_bins, ft, use_group} { monitor_.Init(__func__); ParallelFor(sketches_.size(), n_threads_, Sched::Auto(), [&](auto i) { - auto n_bins = std::min(static_cast(max_bins_), columns_size_[i]); + auto n_bins = std::min(static_cast(max_bins_), columns_size_[i]); n_bins = std::max(n_bins, static_cast(1)); auto eps = 1.0 / (static_cast(n_bins) * WQSketch::kFactor); if (!IsCat(this->feature_types_, i)) { diff --git a/src/common/quantile.cu b/src/common/quantile.cu index a903d2f69..8a6f9f0db 100644 --- a/src/common/quantile.cu +++ b/src/common/quantile.cu @@ -1,5 +1,5 @@ /** - * Copyright 2020-2023 by XGBoost Contributors + * Copyright 2020-2024, XGBoost Contributors */ #include #include @@ -8,8 +8,8 @@ #include #include -#include // std::numeric_limits -#include +#include // std::numeric_limits +#include // for partial_sum #include #include "../collective/communicator-inl.cuh" @@ -115,16 +115,16 @@ void CopyTo(Span out, Span src) { // Compute the merge path. 
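The allreduce calls above move from raw pointer/length pairs to `linalg::MakeVec` plus a `Result` return, so synchronization failures surface through `SafeColl` instead of being silently dropped. A minimal sketch of the new calling convention, assuming the same `GlobalSum` overload this patch uses in `quantile.cc`; the wrapper function and buffer are illustrative.

```cpp
#include <vector>  // for vector

#include "../collective/aggregator.h"   // collective::GlobalSum (as included by quantile.cc)
#include "xgboost/base.h"               // bst_idx_t
#include "xgboost/collective/result.h"  // SafeColl
#include "xgboost/context.h"            // Context
#include "xgboost/data.h"               // MetaInfo
#include "xgboost/linalg.h"             // linalg::MakeVec

namespace xgboost {
// Sum a per-column size buffer across all workers in place.
void SyncColumnSizes(Context const* ctx, MetaInfo const& info, std::vector<bst_idx_t>* p_sizes) {
  auto& sizes = *p_sizes;
  auto rc = collective::GlobalSum(ctx, info, linalg::MakeVec(sizes.data(), sizes.size()));
  collective::SafeColl(rc);
}
}  // namespace xgboost
```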
common::Span> MergePath( - Span const &d_x, Span const &x_ptr, - Span const &d_y, Span const &y_ptr, - Span out, Span out_ptr) { + Span const &d_x, Span const &x_ptr, + Span const &d_y, Span const &y_ptr, + Span out, Span out_ptr) { auto x_merge_key_it = thrust::make_zip_iterator(thrust::make_tuple( - dh::MakeTransformIterator( + dh::MakeTransformIterator( thrust::make_counting_iterator(0ul), [=] __device__(size_t idx) { return dh::SegmentId(x_ptr, idx); }), d_x.data())); auto y_merge_key_it = thrust::make_zip_iterator(thrust::make_tuple( - dh::MakeTransformIterator( + dh::MakeTransformIterator( thrust::make_counting_iterator(0ul), [=] __device__(size_t idx) { return dh::SegmentId(y_ptr, idx); }), d_y.data())); @@ -175,13 +175,13 @@ common::Span> MergePath( auto scan_key_it = dh::MakeTransformIterator( thrust::make_counting_iterator(0ul), - [=] __device__(size_t idx) { return dh::SegmentId(out_ptr, idx); }); + [=] XGBOOST_DEVICE(size_t idx) { return dh::SegmentId(out_ptr, idx); }); auto scan_val_it = dh::MakeTransformIterator( - merge_path.data(), [=] __device__(Tuple const &t) -> Tuple { + merge_path.data(), [=] XGBOOST_DEVICE(Tuple const &t) -> Tuple { auto ind = get_ind(t); // == 0 if element is from x // x_counter, y_counter - return thrust::make_tuple(!ind, ind); + return thrust::tuple{!ind, ind}; }); // Compute the index for both x and y (which of the element in a and b are used in each @@ -208,8 +208,8 @@ common::Span> MergePath( // run it in 2 passes to obtain the merge path and then customize the standard merge // algorithm. void MergeImpl(DeviceOrd device, Span const &d_x, - Span const &x_ptr, Span const &d_y, - Span const &y_ptr, Span out, Span out_ptr) { + Span const &x_ptr, Span const &d_y, + Span const &y_ptr, Span out, Span out_ptr) { dh::safe_cuda(cudaSetDevice(device.ordinal)); CHECK_EQ(d_x.size() + d_y.size(), out.size()); CHECK_EQ(x_ptr.size(), out_ptr.size()); diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh index 31d8574c1..f446a9508 100644 --- a/src/common/quantile.cuh +++ b/src/common/quantile.cuh @@ -1,8 +1,9 @@ +/** + * Copyright 2020-2024, XGBoost Contributors + */ #ifndef XGBOOST_COMMON_QUANTILE_CUH_ #define XGBOOST_COMMON_QUANTILE_CUH_ -#include - #include "xgboost/span.h" #include "xgboost/data.h" #include "device_helpers.cuh" @@ -32,13 +33,13 @@ struct SketchUnique { class SketchContainer { public: static constexpr float kFactor = WQSketch::kFactor; - using OffsetT = bst_row_t; + using OffsetT = bst_idx_t; static_assert(sizeof(OffsetT) == sizeof(size_t), "Wrong type for sketch element offset."); private: Monitor timer_; HostDeviceVector feature_types_; - bst_row_t num_rows_; + bst_idx_t num_rows_; bst_feature_t num_columns_; int32_t num_bins_; DeviceOrd device_; @@ -94,7 +95,7 @@ class SketchContainer { * \param device GPU ID. 
*/ SketchContainer(HostDeviceVector const& feature_types, int32_t max_bin, - bst_feature_t num_columns, bst_row_t num_rows, DeviceOrd device) + bst_feature_t num_columns, bst_idx_t num_rows, DeviceOrd device) : num_rows_{num_rows}, num_columns_{num_columns}, num_bins_{max_bin}, device_{device} { CHECK(device.IsCUDA()); // Initialize Sketches for this dmatrix diff --git a/src/common/quantile.h b/src/common/quantile.h index 0af93a03e..59bc3a4f7 100644 --- a/src/common/quantile.h +++ b/src/common/quantile.h @@ -1,5 +1,5 @@ /** - * Copyright 2014-2023 by XGBoost Contributors + * Copyright 2014-2024, XGBoost Contributors * \file quantile.h * \brief util to compute quantiles * \author Tianqi Chen @@ -701,12 +701,12 @@ inline std::vector UnrollGroupWeights(MetaInfo const &info) { auto n_groups = group_ptr.size() - 1; CHECK_EQ(info.weights_.Size(), n_groups) << error::GroupWeight(); - bst_row_t n_samples = info.num_row_; + bst_idx_t n_samples = info.num_row_; std::vector results(n_samples); CHECK_EQ(group_ptr.back(), n_samples) << error::GroupSize() << " the number of rows from the data."; size_t cur_group = 0; - for (bst_row_t i = 0; i < n_samples; ++i) { + for (bst_idx_t i = 0; i < n_samples; ++i) { results[i] = group_weights[cur_group]; if (i == group_ptr[cur_group + 1]) { cur_group++; @@ -719,9 +719,9 @@ inline std::vector UnrollGroupWeights(MetaInfo const &info) { class HistogramCuts; template -std::vector CalcColumnSize(Batch const &batch, bst_feature_t const n_columns, +std::vector CalcColumnSize(Batch const &batch, bst_feature_t const n_columns, size_t const n_threads, IsValid &&is_valid) { - std::vector> column_sizes_tloc(n_threads); + std::vector> column_sizes_tloc(n_threads); for (auto &column : column_sizes_tloc) { column.resize(n_columns, 0); } @@ -759,7 +759,7 @@ std::vector LoadBalance(Batch const &batch, size_t nnz, bst_featu size_t const entries_per_thread = DivRoundUp(total_entries, nthreads); // Need to calculate the size for each batch. - std::vector entries_per_columns = CalcColumnSize(batch, n_columns, nthreads, is_valid); + std::vector entries_per_columns = CalcColumnSize(batch, n_columns, nthreads, is_valid); std::vector cols_ptr(nthreads + 1, 0); size_t count{0}; size_t current_thread{1}; @@ -791,8 +791,8 @@ class SketchContainerImpl { std::vector> categories_; std::vector const feature_types_; - std::vector columns_size_; - int32_t max_bins_; + std::vector columns_size_; + bst_bin_t max_bins_; bool use_group_ind_{false}; int32_t n_threads_; bool has_categorical_{false}; @@ -805,7 +805,7 @@ class SketchContainerImpl { * \param max_bins maximum number of bins for each feature. * \param use_group whether is assigned to group to data instance. */ - SketchContainerImpl(Context const *ctx, std::vector columns_size, int32_t max_bins, + SketchContainerImpl(Context const *ctx, std::vector columns_size, bst_bin_t max_bins, common::Span feature_types, bool use_group); static bool UseGroup(MetaInfo const &info) { @@ -829,8 +829,8 @@ class SketchContainerImpl { // Gather sketches from all workers. void GatherSketchInfo(Context const *ctx, MetaInfo const &info, std::vector const &reduced, - std::vector *p_worker_segments, - std::vector *p_sketches_scan, + std::vector *p_worker_segments, + std::vector *p_sketches_scan, std::vector *p_global_sketches); // Merge sketches from all workers. 
void AllReduce(Context const *ctx, MetaInfo const &info, @@ -901,7 +901,7 @@ class HostSketchContainer : public SketchContainerImpl ft, - std::vector columns_size, bool use_group); + std::vector columns_size, bool use_group); template void PushAdapterBatch(Batch const &batch, size_t base_rowid, MetaInfo const &info, float missing); @@ -998,7 +998,7 @@ class SortedSketchContainer : public SketchContainerImpl ft, - std::vector columns_size, bool use_group) + std::vector columns_size, bool use_group) : SketchContainerImpl{ctx, columns_size, max_bins, ft, use_group} { monitor_.Init(__func__); sketches_.resize(columns_size.size()); diff --git a/src/common/quantile.hip b/src/common/quantile.hip deleted file mode 100644 index c0e4385be..000000000 --- a/src/common/quantile.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "quantile.cu" -#endif diff --git a/src/common/random.h b/src/common/random.h index 098e94b74..908090871 100644 --- a/src/common/random.h +++ b/src/common/random.h @@ -31,7 +31,7 @@ namespace xgboost::common { */ using RandomEngine = std::mt19937; -#if XGBOOST_CUSTOMIZE_GLOBAL_PRNG +#if defined(XGBOOST_CUSTOMIZE_GLOBAL_PRNG) && XGBOOST_CUSTOMIZE_GLOBAL_PRNG == 1 /*! * \brief An customized random engine, used to be plugged in PRNG from other systems. * The implementation of this library is not provided by xgboost core library. diff --git a/src/common/random.hip b/src/common/random.hip deleted file mode 100644 index 8f2a6f7a0..000000000 --- a/src/common/random.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "random.cu" -#endif diff --git a/src/common/ranking_utils.h b/src/common/ranking_utils.h index e6b87ed4b..acba0feeb 100644 --- a/src/common/ranking_utils.h +++ b/src/common/ranking_utils.h @@ -78,6 +78,7 @@ struct LambdaRankParam : public XGBoostParameter { // unbiased bool lambdarank_unbiased{false}; + bool lambdarank_normalization{true}; double lambdarank_bias_norm{1.0}; // ndcg bool ndcg_exp_gain{true}; @@ -86,6 +87,7 @@ struct LambdaRankParam : public XGBoostParameter { return lambdarank_pair_method == that.lambdarank_pair_method && lambdarank_num_pair_per_sample == that.lambdarank_num_pair_per_sample && lambdarank_unbiased == that.lambdarank_unbiased && + lambdarank_normalization == that.lambdarank_normalization && lambdarank_bias_norm == that.lambdarank_bias_norm && ndcg_exp_gain == that.ndcg_exp_gain; } bool operator!=(LambdaRankParam const& that) const { return !(*this == that); } @@ -134,6 +136,9 @@ struct LambdaRankParam : public XGBoostParameter { DMLC_DECLARE_FIELD(lambdarank_unbiased) .set_default(false) .describe("Unbiased lambda mart. 
Use extended IPW to debias click position"); + DMLC_DECLARE_FIELD(lambdarank_normalization) + .set_default(true) + .describe("Whether to normalize the leaf value for lambda rank."); DMLC_DECLARE_FIELD(lambdarank_bias_norm) .set_default(1.0) .set_lower_bound(0.0) diff --git a/src/common/ranking_utils.hip b/src/common/ranking_utils.hip deleted file mode 100644 index a7860758d..000000000 --- a/src/common/ranking_utils.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "ranking_utils.cu" -#endif diff --git a/src/common/stats.hip b/src/common/stats.hip deleted file mode 100644 index b8d51225e..000000000 --- a/src/common/stats.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "stats.cu" -#endif diff --git a/src/common/timer.cc b/src/common/timer.cc index 99150aa26..2eccc67cd 100644 --- a/src/common/timer.cc +++ b/src/common/timer.cc @@ -1,9 +1,8 @@ -/*! - * Copyright by Contributors 2019 +/** + * Copyright 2019-2024, XGBoost Contributors */ #include "timer.h" -#include #include #include "../collective/communicator-inl.h" @@ -61,6 +60,9 @@ void Monitor::Print() const { kv.second.timer.elapsed) .count()); } + if (stat_map.empty()) { + return; + } LOG(CONSOLE) << "======== Monitor (" << rank << "): " << label_ << " ========"; this->PrintStatistics(stat_map); } diff --git a/src/context.hip b/src/context.hip deleted file mode 100644 index d4e3938bf..000000000 --- a/src/context.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "context.cu" -#endif diff --git a/src/data/adapter.h b/src/data/adapter.h index e9a4ad9fc..0ad1e9e38 100644 --- a/src/data/adapter.h +++ b/src/data/adapter.h @@ -73,11 +73,11 @@ constexpr size_t kAdapterUnknownSize = std::numeric_limits::max(); struct COOTuple { COOTuple() = default; - XGBOOST_DEVICE COOTuple(size_t row_idx, size_t column_idx, float value) + XGBOOST_DEVICE COOTuple(bst_idx_t row_idx, bst_idx_t column_idx, float value) : row_idx(row_idx), column_idx(column_idx), value(value) {} - size_t row_idx{0}; - size_t column_idx{0}; + bst_idx_t row_idx{0}; + bst_idx_t column_idx{0}; float value{0}; }; @@ -136,12 +136,8 @@ class CSRAdapterBatch : public detail::NoMetaInfo { public: class Line { public: - Line(size_t row_idx, size_t size, const unsigned* feature_idx, - const float* values) - : row_idx_(row_idx), - size_(size), - feature_idx_(feature_idx), - values_(values) {} + Line(bst_idx_t row_idx, bst_idx_t size, const unsigned* feature_idx, const float* values) + : row_idx_(row_idx), size_(size), feature_idx_(feature_idx), values_(values) {} size_t Size() const { return size_; } COOTuple GetElement(size_t idx) const { @@ -149,8 +145,8 @@ class CSRAdapterBatch : public detail::NoMetaInfo { } private: - size_t row_idx_; - size_t size_; + bst_idx_t row_idx_; + bst_idx_t size_; const unsigned* feature_idx_; const float* values_; }; @@ -178,29 +174,25 @@ class CSRAdapterBatch : public detail::NoMetaInfo { class CSRAdapter : public detail::SingleBatchDataIter { public: - CSRAdapter(const size_t* row_ptr, const unsigned* feature_idx, - const float* values, size_t num_rows, size_t num_elements, - size_t num_features) - : batch_(row_ptr, feature_idx, values, num_rows, num_elements, - num_features), + CSRAdapter(const size_t* row_ptr, const unsigned* feature_idx, const float* values, + bst_idx_t num_rows, bst_idx_t num_elements, size_t num_features) + : batch_(row_ptr, feature_idx, values, num_rows, num_elements, num_features), num_rows_(num_rows), num_columns_(num_features) {} const CSRAdapterBatch& 
Value() const override { return batch_; } - size_t NumRows() const { return num_rows_; } - size_t NumColumns() const { return num_columns_; } + bst_idx_t NumRows() const { return num_rows_; } + bst_idx_t NumColumns() const { return num_columns_; } private: CSRAdapterBatch batch_; - size_t num_rows_; - size_t num_columns_; + bst_idx_t num_rows_; + bst_idx_t num_columns_; }; class DenseAdapterBatch : public detail::NoMetaInfo { public: - DenseAdapterBatch(const float* values, size_t num_rows, size_t num_features) - : values_(values), - num_rows_(num_rows), - num_features_(num_features) {} + DenseAdapterBatch(const float* values, bst_idx_t num_rows, bst_idx_t num_features) + : values_(values), num_rows_(num_rows), num_features_(num_features) {} private: class Line { @@ -910,7 +902,7 @@ class SparsePageAdapterBatch { struct Line { Entry const* inst; size_t n; - bst_row_t ridx; + bst_idx_t ridx; COOTuple GetElement(size_t idx) const { return {ridx, inst[idx].index, inst[idx].fvalue}; } size_t Size() const { return n; } }; diff --git a/src/data/array_interface.hip b/src/data/array_interface.hip deleted file mode 100644 index b90160d91..000000000 --- a/src/data/array_interface.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "array_interface.cu" -#endif diff --git a/src/data/data.cc b/src/data/data.cc index 0096c9c60..1ffd0f153 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -11,7 +11,6 @@ #include // for abs #include // for uint64_t, int32_t, uint8_t, uint32_t #include // for size_t, strcmp, memcpy -#include // for exception #include // for operator<<, basic_ostream, basic_ostream::op... #include // for map, operator!= #include // for accumulate, partial_sum @@ -22,7 +21,6 @@ #include "../collective/communicator.h" // for Operation #include "../common/algorithm.h" // for StableSort #include "../common/api_entry.h" // for XGBAPIThreadLocalEntry -#include "../common/common.h" // for Split #include "../common/error_msg.h" // for GroupSize, GroupWeight, InfInData #include "../common/group_data.h" // for ParallelGroupBuilder #include "../common/io.h" // for PeekableInStream @@ -47,7 +45,7 @@ #include "simple_dmatrix.h" // for SimpleDMatrix #include "sparse_page_writer.h" // for SparsePageFormatReg #include "validation.h" // for LabelsCheck, WeightsCheck, ValidateQueryGroup -#include "xgboost/base.h" // for bst_group_t, bst_row_t, bst_float, bst_ulong +#include "xgboost/base.h" // for bst_group_t, bst_idx_t, bst_float, bst_ulong #include "xgboost/context.h" // for Context #include "xgboost/host_device_vector.h" // for HostDeviceVector #include "xgboost/learner.h" // for HostDeviceVector @@ -473,11 +471,11 @@ void MetaInfo::SetInfo(Context const& ctx, StringView key, StringView interface_ << ", must have at least 1 column even if it's empty."; auto const& first = get(array.front()); auto ptr = ArrayInterfaceHandler::GetPtrFromArrayData(first); - is_cuda = ArrayInterfaceHandler::IsCudaPtr(ptr); + is_cuda = first.find("stream") != first.cend() || ArrayInterfaceHandler::IsCudaPtr(ptr); } else { auto const& first = get(j_interface); auto ptr = ArrayInterfaceHandler::GetPtrFromArrayData(first); - is_cuda = ArrayInterfaceHandler::IsCudaPtr(ptr); + is_cuda = first.find("stream") != first.cend() || ArrayInterfaceHandler::IsCudaPtr(ptr); } if (is_cuda) { @@ -567,46 +565,6 @@ void MetaInfo::SetInfoFromHost(Context const& ctx, StringView key, Json arr) { } } -void MetaInfo::SetInfo(Context const& ctx, const char* key, const void* dptr, DataType dtype, - size_t num) { - 
CHECK(key); - auto proc = [&](auto cast_d_ptr) { - using T = std::remove_pointer_t; - auto t = linalg::TensorView(common::Span{cast_d_ptr, num}, {num}, DeviceOrd::CPU()); - CHECK(t.CContiguous()); - Json interface { - linalg::ArrayInterface(t) - }; - assert(ArrayInterface<1>{interface}.is_contiguous); - return interface; - }; - // Legacy code using XGBoost dtype, which is a small subset of array interface types. - switch (dtype) { - case xgboost::DataType::kFloat32: { - auto cast_ptr = reinterpret_cast(dptr); - this->SetInfoFromHost(ctx, key, proc(cast_ptr)); - break; - } - case xgboost::DataType::kDouble: { - auto cast_ptr = reinterpret_cast(dptr); - this->SetInfoFromHost(ctx, key, proc(cast_ptr)); - break; - } - case xgboost::DataType::kUInt32: { - auto cast_ptr = reinterpret_cast(dptr); - this->SetInfoFromHost(ctx, key, proc(cast_ptr)); - break; - } - case xgboost::DataType::kUInt64: { - auto cast_ptr = reinterpret_cast(dptr); - this->SetInfoFromHost(ctx, key, proc(cast_ptr)); - break; - } - default: - LOG(FATAL) << "Unknown data type" << static_cast(dtype); - } -} - void MetaInfo::GetInfo(char const* key, bst_ulong* out_len, DataType dtype, const void** out_dptr) const { if (dtype == DataType::kFloat32) { @@ -996,7 +954,7 @@ template DMatrix* DMatrix::Create( SparsePage SparsePage::GetTranspose(int num_columns, int32_t n_threads) const { SparsePage transpose; - common::ParallelGroupBuilder builder(&transpose.offset.HostVector(), + common::ParallelGroupBuilder builder(&transpose.offset.HostVector(), &transpose.data.HostVector()); builder.InitBudget(num_columns, n_threads); long batch_size = static_cast(this->Size()); // NOLINT(*) @@ -1192,7 +1150,7 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread void SparsePage::PushCSC(const SparsePage &batch) { std::vector& self_data = data.HostVector(); - std::vector& self_offset = offset.HostVector(); + std::vector& self_offset = offset.HostVector(); auto const& other_data = batch.data.ConstHostVector(); auto const& other_offset = batch.offset.ConstHostVector(); @@ -1211,7 +1169,7 @@ void SparsePage::PushCSC(const SparsePage &batch) { return; } - std::vector offset(other_offset.size()); + std::vector offset(other_offset.size()); offset[0] = 0; std::vector data(self_data.size() + other_data.size()); diff --git a/src/data/data.hip b/src/data/data.hip deleted file mode 100644 index a0b80a7e0..000000000 --- a/src/data/data.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "data.cu" -#endif diff --git a/src/data/device_adapter.cuh b/src/data/device_adapter.cuh index 7fe4d831c..1b5566dbb 100644 --- a/src/data/device_adapter.cuh +++ b/src/data/device_adapter.cuh @@ -39,7 +39,7 @@ class CudfAdapterBatch : public detail::NoMetaInfo { return {row_idx, column_idx, value}; } - [[nodiscard]] __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const { + [[nodiscard]] __device__ float GetElement(bst_idx_t ridx, bst_feature_t fidx) const { auto const& column = columns_[fidx]; float value = column.valid.Data() == nullptr || column.valid.Check(ridx) ? 
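Editor's note: the deleted MetaInfo::SetInfo overload above dispatched a type-erased pointer through the legacy DataType enum before wrapping it in an array interface. The sketch below shows only the shape of that dispatch with stand-in names (DemoDataType, DispatchDType); the generic callback takes the place of SetInfoFromHost.

#include <cstdint>
#include <cstdlib>
#include <iostream>

enum class DemoDataType : std::uint8_t { kFloat32, kDouble, kUInt32, kUInt64 };

// Convert a void* plus runtime dtype into a typed pointer and hand it to a generic callback.
template <typename Fn>
void DispatchDType(void const* dptr, DemoDataType dtype, Fn&& fn) {
  switch (dtype) {
    case DemoDataType::kFloat32: fn(static_cast<float const*>(dptr)); break;
    case DemoDataType::kDouble:  fn(static_cast<double const*>(dptr)); break;
    case DemoDataType::kUInt32:  fn(static_cast<std::uint32_t const*>(dptr)); break;
    case DemoDataType::kUInt64:  fn(static_cast<std::uint64_t const*>(dptr)); break;
    default: std::cerr << "Unknown data type\n"; std::abort();
  }
}

// Usage: DispatchDType(ptr, DemoDataType::kFloat32,
//                      [&](auto typed) { /* wrap `typed` in a 1-D array interface */ });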
column(ridx) @@ -47,8 +47,8 @@ class CudfAdapterBatch : public detail::NoMetaInfo { return value; } - [[nodiscard]] XGBOOST_DEVICE bst_row_t NumRows() const { return num_rows_; } - [[nodiscard]] XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); } + [[nodiscard]] XGBOOST_DEVICE bst_idx_t NumRows() const { return num_rows_; } + [[nodiscard]] XGBOOST_DEVICE bst_idx_t NumCols() const { return columns_.size(); } private: common::Span> columns_; @@ -168,13 +168,13 @@ class CupyAdapterBatch : public detail::NoMetaInfo { float value = array_interface_(row_idx, column_idx); return {row_idx, column_idx, value}; } - [[nodiscard]] __device__ float GetElement(bst_row_t ridx, bst_feature_t fidx) const { + [[nodiscard]] __device__ float GetElement(bst_idx_t ridx, bst_feature_t fidx) const { float value = array_interface_(ridx, fidx); return value; } - [[nodiscard]] XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); } - [[nodiscard]] XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); } + [[nodiscard]] XGBOOST_DEVICE bst_idx_t NumRows() const { return array_interface_.Shape(0); } + [[nodiscard]] XGBOOST_DEVICE bst_idx_t NumCols() const { return array_interface_.Shape(1); } private: ArrayInterface<2> array_interface_; @@ -208,8 +208,8 @@ class CupyAdapter : public detail::SingleBatchDataIter { // Returns maximum row length template -std::size_t GetRowCounts(const AdapterBatchT batch, common::Span offset, DeviceOrd device, - float missing) { +bst_idx_t GetRowCounts(const AdapterBatchT batch, common::Span offset, DeviceOrd device, + float missing) { dh::safe_cuda(cudaSetDevice(device.ordinal)); IsValidFunctor is_valid(missing); dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes())); @@ -231,7 +231,7 @@ std::size_t GetRowCounts(const AdapterBatchT batch, common::Span offs // Count elements per row dh::LaunchN(n_samples * stride, [=] __device__(std::size_t idx) { - bst_row_t cnt{0}; + bst_idx_t cnt{0}; auto [ridx, fbeg] = linalg::UnravelIndex(idx, n_samples, stride); SPAN_CHECK(ridx < n_samples); for (bst_feature_t fidx = fbeg; fidx < n_features; fidx += stride) { @@ -246,10 +246,10 @@ std::size_t GetRowCounts(const AdapterBatchT batch, common::Span offs }); dh::XGBCachingDeviceAllocator alloc; - bst_row_t row_stride = + bst_idx_t row_stride = dh::Reduce(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()), thrust::device_pointer_cast(offset.data()) + offset.size(), - static_cast(0), thrust::maximum()); + static_cast(0), thrust::maximum()); return row_stride; } diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index b8ec1ab09..15d0a026d 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -175,11 +175,10 @@ struct WriteCompressedEllpackFunctor { using Tuple = thrust::tuple; __device__ size_t operator()(Tuple out) { - auto e = batch.GetElement(out.get<2>()); + auto e = batch.GetElement(thrust::get<2>(out)); if (is_valid(e)) { // -1 because the scan is inclusive - size_t output_position = - accessor.row_stride * e.row_idx + out.get<1>() - 1; + size_t output_position = accessor.row_stride * e.row_idx + thrust::get<1>(out) - 1; uint32_t bin_idx = 0; if (common::IsCat(feature_types, e.column_idx)) { bin_idx = accessor.SearchBin(e.value, e.column_idx); @@ -196,8 +195,8 @@ template struct TupleScanOp { __device__ Tuple operator()(Tuple a, Tuple b) { // Key equal - if (a.template get<0>() == b.template get<0>()) { - b.template get<1>() += a.template get<1>(); + if (thrust::get<0>(a) 
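Editor's note: the ellpack functor now uses the free function thrust::get<I>(t) instead of the member t.get<I>(), which is the form that works uniformly across tuple implementations (relevant when the same source is also built for HIP). A tiny usage sketch, assuming the Thrust headers are on the include path; the helper name is illustrative.

#include <thrust/tuple.h>

// Free-function access into a thrust::tuple, as in WriteCompressedEllpackFunctor above.
inline int KeyPlusCount(thrust::tuple<int, int, float> const& t) {
  return thrust::get<0>(t) + thrust::get<1>(t);  // key + running count
}

// auto t = thrust::make_tuple(3, 4, 0.5f);  // KeyPlusCount(t) == 7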
== thrust::get<0>(b)) { + thrust::get<1>(b) += thrust::get<1>(a); return b; } // Not equal diff --git a/src/data/ellpack_page.hip b/src/data/ellpack_page.hip deleted file mode 100644 index 697e9a021..000000000 --- a/src/data/ellpack_page.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "ellpack_page.cu" -#endif diff --git a/src/data/ellpack_page_raw_format.hip b/src/data/ellpack_page_raw_format.hip deleted file mode 100644 index 9337d6afb..000000000 --- a/src/data/ellpack_page_raw_format.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "ellpack_page_raw_format.cu" -#endif diff --git a/src/data/ellpack_page_source.hip b/src/data/ellpack_page_source.hip deleted file mode 100644 index fe26c1cb2..000000000 --- a/src/data/ellpack_page_source.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "ellpack_page_source.cu" -#endif diff --git a/src/data/file_iterator.cc b/src/data/file_iterator.cc index cebfbdc19..1e341447c 100644 --- a/src/data/file_iterator.cc +++ b/src/data/file_iterator.cc @@ -1,5 +1,5 @@ /** - * Copyright 2021-2023, XGBoost contributors + * Copyright 2021-2024, XGBoost contributors */ #include "file_iterator.h" @@ -10,7 +10,10 @@ #include // for operator<<, basic_ostream, istringstream #include // for vector -#include "../common/common.h" // for Split +#include "../common/common.h" // for Split +#include "xgboost/linalg.h" // for ArrayInterfaceStr, MakeVec +#include "xgboost/linalg.h" +#include "xgboost/logging.h" // for CHECK #include "xgboost/string_view.h" // for operator<<, StringView namespace xgboost::data { @@ -28,10 +31,10 @@ std::string ValidateFileFormat(std::string const& uri) { for (size_t i = 0; i < arg_list.size(); ++i) { std::istringstream is(arg_list[i]); std::pair kv; - CHECK(std::getline(is, kv.first, '=')) << "Invalid uri argument format" - << " for key in arg " << i + 1; - CHECK(std::getline(is, kv.second)) << "Invalid uri argument format" - << " for value in arg " << i + 1; + CHECK(std::getline(is, kv.first, '=')) + << "Invalid uri argument format" << " for key in arg " << i + 1; + CHECK(std::getline(is, kv.second)) + << "Invalid uri argument format" << " for value in arg " << i + 1; args.insert(kv); } if (args.find("format") == args.cend()) { @@ -48,4 +51,41 @@ std::string ValidateFileFormat(std::string const& uri) { return name_args[0] + "?" 
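Editor's note: ValidateFileFormat splits URI arguments into key=value pairs with std::getline over string streams. A self-contained sketch of that parsing idea follows; the helper name and the '&' splitting are illustrative, since the real code first splits the argument list with common::Split.

#include <iostream>
#include <map>
#include <sstream>
#include <string>

// Parse "format=libsvm&indexing_mode=1" style arguments into a key/value map.
std::map<std::string, std::string> ParseUriArgs(std::string const& query) {
  std::map<std::string, std::string> args;
  std::istringstream all{query};
  std::string item;
  while (std::getline(all, item, '&')) {
    std::istringstream one{item};
    std::string key, value;
    if (std::getline(one, key, '=') && std::getline(one, value)) {
      args.emplace(key, value);
    } else {
      std::cerr << "Invalid uri argument format: " << item << "\n";
    }
  }
  return args;
}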
+ name_args[1] + '#' + name_args_cache[1]; } } + +int FileIterator::Next() { + CHECK(parser_); + if (parser_->Next()) { + row_block_ = parser_->Value(); + + indptr_ = linalg::Make1dInterface(row_block_.offset, row_block_.size + 1); + values_ = linalg::Make1dInterface(row_block_.value, row_block_.offset[row_block_.size]); + indices_ = linalg::Make1dInterface(row_block_.index, row_block_.offset[row_block_.size]); + + size_t n_columns = + *std::max_element(row_block_.index, row_block_.index + row_block_.offset[row_block_.size]); + // dmlc parser converts 1-based indexing back to 0-based indexing so we can ignore + // this condition and just add 1 to n_columns + n_columns += 1; + + XGProxyDMatrixSetDataCSR(proxy_, indptr_.c_str(), indices_.c_str(), values_.c_str(), n_columns); + + if (row_block_.label) { + auto str = linalg::Make1dInterface(row_block_.label, row_block_.size); + XGDMatrixSetInfoFromInterface(proxy_, "label", str.c_str()); + } + if (row_block_.qid) { + auto str = linalg::Make1dInterface(row_block_.qid, row_block_.size); + XGDMatrixSetInfoFromInterface(proxy_, "qid", str.c_str()); + } + if (row_block_.weight) { + auto str = linalg::Make1dInterface(row_block_.weight, row_block_.size); + XGDMatrixSetInfoFromInterface(proxy_, "weight", str.c_str()); + } + // Continue iteration + return true; + } else { + // Stop iteration + return false; + } +} } // namespace xgboost::data diff --git a/src/data/file_iterator.h b/src/data/file_iterator.h index c7f23b478..a4afbabe4 100644 --- a/src/data/file_iterator.h +++ b/src/data/file_iterator.h @@ -1,20 +1,16 @@ /** - * Copyright 2021-2023, XGBoost contributors + * Copyright 2021-2024, XGBoost contributors */ #ifndef XGBOOST_DATA_FILE_ITERATOR_H_ #define XGBOOST_DATA_FILE_ITERATOR_H_ -#include // for max_element -#include // for size_t #include // for uint32_t #include // for unique_ptr #include // for string #include // for move #include "dmlc/data.h" // for RowBlock, Parser -#include "xgboost/c_api.h" // for XGDMatrixSetDenseInfo, XGDMatrixFree, XGProxyDMatrixCreate -#include "xgboost/linalg.h" // for ArrayInterfaceStr, MakeVec -#include "xgboost/logging.h" // for CHECK +#include "xgboost/c_api.h" // for XGDMatrixFree, XGProxyDMatrixCreate namespace xgboost::data { [[nodiscard]] std::string ValidateFileFormat(std::string const& uri); @@ -53,41 +49,7 @@ class FileIterator { XGDMatrixFree(proxy_); } - int Next() { - CHECK(parser_); - if (parser_->Next()) { - row_block_ = parser_->Value(); - using linalg::MakeVec; - - indptr_ = ArrayInterfaceStr(MakeVec(row_block_.offset, row_block_.size + 1)); - values_ = ArrayInterfaceStr(MakeVec(row_block_.value, row_block_.offset[row_block_.size])); - indices_ = ArrayInterfaceStr(MakeVec(row_block_.index, row_block_.offset[row_block_.size])); - - size_t n_columns = *std::max_element(row_block_.index, - row_block_.index + row_block_.offset[row_block_.size]); - // dmlc parser converts 1-based indexing back to 0-based indexing so we can ignore - // this condition and just add 1 to n_columns - n_columns += 1; - - XGProxyDMatrixSetDataCSR(proxy_, indptr_.c_str(), indices_.c_str(), - values_.c_str(), n_columns); - - if (row_block_.label) { - XGDMatrixSetDenseInfo(proxy_, "label", row_block_.label, row_block_.size, 1); - } - if (row_block_.qid) { - XGDMatrixSetDenseInfo(proxy_, "qid", row_block_.qid, row_block_.size, 1); - } - if (row_block_.weight) { - XGDMatrixSetDenseInfo(proxy_, "weight", row_block_.weight, row_block_.size, 1); - } - // Continue iteration - return true; - } else { - // Stop iteration - return 
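Editor's note: FileIterator::Next now hands the parsed text block to the proxy DMatrix as array-interface JSON strings (via linalg::Make1dInterface) instead of the removed XGDMatrixSetDenseInfo. Below is a hand-rolled approximation of what such a 1-D interface string looks like, assuming the numpy-style fields ("data", "shape", "typestr", "version"); the exact field set produced by Make1dInterface may differ, and the function name is a stand-in.

#include <cstdint>
#include <sstream>
#include <string>

// Build a minimal 1-D array-interface JSON string for a float buffer.
std::string Make1dInterfaceSketch(float const* data, std::size_t n) {
  std::ostringstream os;
  os << R"({"data": [)" << reinterpret_cast<std::uintptr_t>(data) << ", true], "
     << R"("shape": [)" << n << "], "
     << R"("typestr": "<f4", "version": 3})";
  return os.str();
}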
false; - } - } + int Next(); auto Proxy() -> decltype(proxy_) { return proxy_; } diff --git a/src/data/gradient_index.cc b/src/data/gradient_index.cc index e3502e8c4..449b07d3d 100644 --- a/src/data/gradient_index.cc +++ b/src/data/gradient_index.cc @@ -193,7 +193,7 @@ float GHistIndexMatrix::GetFvalue(size_t ridx, size_t fidx, bool is_cat) const { float GHistIndexMatrix::GetFvalue(std::vector const &ptrs, std::vector const &values, std::vector const &mins, - bst_row_t ridx, bst_feature_t fidx, bool is_cat) const { + bst_idx_t ridx, bst_feature_t fidx, bool is_cat) const { if (is_cat) { auto gidx = GetGindex(ridx, fidx); if (gidx == -1) { diff --git a/src/data/gradient_index.h b/src/data/gradient_index.h index 0bb93fc20..f1754fe35 100644 --- a/src/data/gradient_index.h +++ b/src/data/gradient_index.h @@ -149,7 +149,7 @@ class GHistIndexMatrix { /** @brief max_bin for each feature. */ bst_bin_t max_numeric_bins_per_feat; /** @brief base row index for current page (used by external memory) */ - bst_row_t base_rowid{0}; + bst_idx_t base_rowid{0}; [[nodiscard]] bst_bin_t MaxNumBinPerFeat() const { return std::max(static_cast(cut.MaxCategory() + 1), max_numeric_bins_per_feat); @@ -230,7 +230,7 @@ class GHistIndexMatrix { */ [[nodiscard]] std::size_t RowIdx(size_t ridx) const { return row_ptr[ridx - base_rowid]; } - [[nodiscard]] bst_row_t Size() const { return row_ptr.empty() ? 0 : row_ptr.size() - 1; } + [[nodiscard]] bst_idx_t Size() const { return row_ptr.empty() ? 0 : row_ptr.size() - 1; } [[nodiscard]] bst_feature_t Features() const { return cut.Ptrs().size() - 1; } [[nodiscard]] bool ReadColumnPage(common::AlignedResourceReadStream* fi); @@ -243,7 +243,7 @@ class GHistIndexMatrix { [[nodiscard]] float GetFvalue(size_t ridx, size_t fidx, bool is_cat) const; [[nodiscard]] float GetFvalue(std::vector const& ptrs, std::vector const& values, std::vector const& mins, - bst_row_t ridx, bst_feature_t fidx, bool is_cat) const; + bst_idx_t ridx, bst_feature_t fidx, bool is_cat) const; [[nodiscard]] common::HistogramCuts& Cuts() { return cut; } [[nodiscard]] common::HistogramCuts const& Cuts() const { return cut; } diff --git a/src/data/gradient_index.hip b/src/data/gradient_index.hip deleted file mode 100644 index 7cc0c154d..000000000 --- a/src/data/gradient_index.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "gradient_index.cu" -#endif diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index a6a39c3b8..70a6db595 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -132,7 +132,7 @@ void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p, return HostAdapterDispatch(proxy, [](auto const& value) { return value.NumCols(); }); }; - std::vector column_sizes; + std::vector column_sizes; auto const is_valid = data::IsValidFunctor{missing}; auto nnz_cnt = [&]() { return HostAdapterDispatch(proxy, [&](auto const& value) { diff --git a/src/data/iterative_dmatrix.hip b/src/data/iterative_dmatrix.hip deleted file mode 100644 index cba78dbe1..000000000 --- a/src/data/iterative_dmatrix.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "iterative_dmatrix.cu" -#endif diff --git a/src/data/proxy_dmatrix.hip b/src/data/proxy_dmatrix.hip deleted file mode 100644 index 6b50e6752..000000000 --- a/src/data/proxy_dmatrix.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "proxy_dmatrix.cu" -#endif diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc index 
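Editor's note: GHistIndexMatrix keeps CSR-style row pointers plus a base_rowid so an external-memory page can answer queries posed with global row indices, as the Size() and RowIdx() changes above show. A tiny sketch of that indexing convention; the struct and member names are stand-ins.

#include <cstddef>
#include <utility>
#include <vector>

struct PageIndex {
  std::vector<std::size_t> row_ptr;  // CSR offsets, size == n_rows + 1
  std::size_t base_rowid{0};         // first global row held by this page

  std::size_t Size() const { return row_ptr.empty() ? 0 : row_ptr.size() - 1; }

  // Global row ridx maps to the half-open entry range [begin, end) within this page.
  std::pair<std::size_t, std::size_t> RowSpan(std::size_t ridx) const {
    auto local = ridx - base_rowid;
    return {row_ptr[local], row_ptr[local + 1]};
  }
};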
99bf67ba0..4df1d5e53 100644 --- a/src/data/simple_dmatrix.cc +++ b/src/data/simple_dmatrix.cc @@ -59,7 +59,7 @@ DMatrix* SimpleDMatrix::SliceCol(int num_slices, int slice_id) { auto& h_data = out_page.data.HostVector(); auto& h_offset = out_page.offset.HostVector(); size_t rptr{0}; - for (bst_row_t i = 0; i < this->Info().num_row_; i++) { + for (bst_idx_t i = 0; i < this->Info().num_row_; i++) { auto inst = batch[i]; auto prev_size = h_data.size(); std::copy_if(inst.begin(), inst.end(), std::back_inserter(h_data), diff --git a/src/data/simple_dmatrix.cuh b/src/data/simple_dmatrix.cuh index 88a88c456..14f4ac289 100644 --- a/src/data/simple_dmatrix.cuh +++ b/src/data/simple_dmatrix.cuh @@ -54,7 +54,7 @@ void CopyDataToDMatrix(AdapterBatchT batch, common::Span data, } template -void CountRowOffsets(const AdapterBatchT& batch, common::Span offset, DeviceOrd device, +void CountRowOffsets(const AdapterBatchT& batch, common::Span offset, DeviceOrd device, float missing) { dh::safe_cuda(cudaSetDevice(device.ordinal)); IsValidFunctor is_valid(missing); diff --git a/src/data/simple_dmatrix.hip b/src/data/simple_dmatrix.hip deleted file mode 100644 index 9be8187e1..000000000 --- a/src/data/simple_dmatrix.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "simple_dmatrix.cu" -#endif diff --git a/src/data/sparse_page_dmatrix.hip b/src/data/sparse_page_dmatrix.hip deleted file mode 100644 index 89fe2ed4b..000000000 --- a/src/data/sparse_page_dmatrix.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "sparse_page_dmatrix.cu" -#endif diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h index 796409c85..ad4587d46 100644 --- a/src/data/sparse_page_source.h +++ b/src/data/sparse_page_source.h @@ -1,5 +1,5 @@ /** - * Copyright 2014-2023, XGBoost Contributors + * Copyright 2014-2024, XGBoost Contributors * \file sparse_page_source.h */ #ifndef XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_ @@ -7,23 +7,26 @@ #include // for min #include // for atomic +#include // for remove #include // for async -#include -#include -#include // for mutex -#include -#include -#include // for pair, move -#include +#include // for unique_ptr +#include // for mutex +#include // for string +#include // for pair, move +#include // for vector -#include "../common/common.h" -#include "../common/io.h" // for PrivateMmapConstStream -#include "../common/timer.h" // for Monitor, Timer -#include "adapter.h" -#include "proxy_dmatrix.h" // for DMatrixProxy -#include "sparse_page_writer.h" // for SparsePageFormat -#include "xgboost/base.h" -#include "xgboost/data.h" +#if !defined(XGBOOST_USE_CUDA) +#include "../common/common.h" // for AssertGPUSupport +#endif // !defined(XGBOOST_USE_CUDA) + +#include "../common/io.h" // for PrivateMmapConstStream +#include "../common/timer.h" // for Monitor, Timer +#include "proxy_dmatrix.h" // for DMatrixProxy +#include "sparse_page_writer.h" // for SparsePageFormat +#include "xgboost/base.h" // for bst_feature_t +#include "xgboost/data.h" // for SparsePage, CSCPage +#include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore +#include "xgboost/logging.h" // for CHECK_EQ namespace xgboost::data { inline void TryDeleteCacheFile(const std::string& file) { @@ -185,6 +188,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl { exce_.Rethrow(); + auto const config = *GlobalConfigThreadLocalStore::Get(); for (std::int32_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) { fetch_it %= n_batches_; // ring if (ring_->at(fetch_it).valid()) { @@ 
-192,7 +196,8 @@ class SparsePageSourceImpl : public BatchIteratorImpl { } auto const* self = this; // make sure it's const CHECK_LT(fetch_it, cache_info_->offset.size()); - ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self, this]() { + ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self, config, this]() { + *GlobalConfigThreadLocalStore::Get() = config; auto page = std::make_shared(); this->exce_.Run([&] { std::unique_ptr> fmt{CreatePageFormat("raw")}; diff --git a/src/data/sparse_page_source.hip b/src/data/sparse_page_source.hip deleted file mode 100644 index 3a3f71e2f..000000000 --- a/src/data/sparse_page_source.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "sparse_page_source.cu" -#endif diff --git a/src/gbm/gbtree.h b/src/gbm/gbtree.h index a2d84d848..d6ed851c8 100644 --- a/src/gbm/gbtree.h +++ b/src/gbm/gbtree.h @@ -1,5 +1,5 @@ /** - * Copyright 2014-2023 by Contributors + * Copyright 2014-2024, XGBoost Contributors * \file gbtree.cc * \brief gradient boosted tree implementation. * \author Tianqi Chen @@ -11,14 +11,12 @@ #include #include // std::int32_t -#include #include +#include // for iota #include -#include #include #include -#include "../common/common.h" #include "../common/timer.h" #include "../tree/param.h" // TrainParam #include "gbtree_model.h" diff --git a/src/gbm/gbtree.hip b/src/gbm/gbtree.hip deleted file mode 100644 index 76040e75f..000000000 --- a/src/gbm/gbtree.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "gbtree.cu" -#endif diff --git a/src/gbm/gbtree_model.cc b/src/gbm/gbtree_model.cc index 14131865f..2edb456c9 100644 --- a/src/gbm/gbtree_model.cc +++ b/src/gbm/gbtree_model.cc @@ -106,30 +106,13 @@ void GBTreeModel::Load(dmlc::Stream* fi) { Validate(*this); } -namespace { -std::int32_t IOThreads(Context const* ctx) { - CHECK(ctx); - std::int32_t n_threads = ctx->Threads(); - // CRAN checks for number of threads used by examples, but we might not have the right - // number of threads when serializing/unserializing models as nthread is a booster - // parameter, which is only effective after booster initialization. - // - // The threshold ratio of CPU time to user time for R is 2.5, we set the number of - // threads to 2. 
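Editor's note: the prefetch change above snapshots the global configuration before launching the std::async workers and restores it inside each task, because GlobalConfigThreadLocalStore is thread-local and a freshly spawned worker would otherwise start with defaults. A reduced illustration of that pattern; DemoGlobalConfig and LaunchPrefetch are stand-ins.

#include <future>

struct DemoGlobalConfig { int verbosity{1}; };
thread_local DemoGlobalConfig demo_config;  // stand-in for *GlobalConfigThreadLocalStore::Get()

std::future<int> LaunchPrefetch() {
  auto const config = demo_config;  // snapshot on the launching thread
  return std::async(std::launch::async, [config] {
    demo_config = config;           // the worker thread restores its own thread_local copy
    return demo_config.verbosity;   // ... the real worker reads and decompresses the page here
  });
}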
-#if defined(XGBOOST_STRICT_R_MODE) && XGBOOST_STRICT_R_MODE == 1 - n_threads = std::min(2, n_threads); -#endif - return n_threads; -} -} // namespace - void GBTreeModel::SaveModel(Json* p_out) const { auto& out = *p_out; CHECK_EQ(param.num_trees, static_cast(trees.size())); out["gbtree_model_param"] = ToJson(param); std::vector trees_json(trees.size()); - common::ParallelFor(trees.size(), IOThreads(ctx_), [&](auto t) { + common::ParallelFor(trees.size(), ctx_->Threads(), [&](auto t) { auto const& tree = trees[t]; Json jtree{Object{}}; tree->SaveModel(&jtree); @@ -167,7 +150,7 @@ void GBTreeModel::LoadModel(Json const& in) { CHECK_EQ(tree_info_json.size(), param.num_trees); tree_info.resize(param.num_trees); - common::ParallelFor(param.num_trees, IOThreads(ctx_), [&](auto t) { + common::ParallelFor(param.num_trees, ctx_->Threads(), [&](auto t) { auto tree_id = get(trees_json[t]["id"]); trees.at(tree_id).reset(new RegTree{}); trees[tree_id]->LoadModel(trees_json[t]); diff --git a/src/learner.cc b/src/learner.cc index db72f7164..ca6704944 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -1,5 +1,5 @@ /** - * Copyright 2014-2023 by XGBoost Contributors + * Copyright 2014-2024, XGBoost Contributors * \file learner.cc * \brief Implementation of learning algorithm. * \author Tianqi Chen @@ -18,7 +18,6 @@ #include // for int32_t, uint32_t, int64_t, uint64_t #include // for atoi #include // for memcpy, size_t, memset -#include // for less #include // for operator<<, setiosflags #include // for back_insert_iterator, distance, back_inserter #include // for numeric_limits @@ -846,7 +845,7 @@ class LearnerConfiguration : public Learner { void InitEstimation(MetaInfo const& info, linalg::Tensor* base_score) { base_score->Reshape(1); - collective::ApplyWithLabels(info, base_score->Data(), + collective::ApplyWithLabels(this->Ctx(), info, base_score->Data(), [&] { UsePtr(obj_)->InitEstimation(info, base_score); }); } }; @@ -1472,7 +1471,7 @@ class LearnerImpl : public LearnerIO { void GetGradient(HostDeviceVector const& preds, MetaInfo const& info, std::int32_t iter, linalg::Matrix* out_gpair) { out_gpair->Reshape(info.num_row_, this->learner_model_param_.OutputLength()); - collective::ApplyWithLabels(info, out_gpair->Data(), + collective::ApplyWithLabels(&ctx_, info, out_gpair->Data(), [&] { obj_->GetGradient(preds, info, iter, out_gpair); }); } diff --git a/src/linear/updater_gpu_coordinate.hip b/src/linear/updater_gpu_coordinate.hip deleted file mode 100644 index b973a568f..000000000 --- a/src/linear/updater_gpu_coordinate.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "updater_gpu_coordinate.cu" -#endif diff --git a/src/metric/auc.cc b/src/metric/auc.cc index b5d63bffc..81c731bf0 100644 --- a/src/metric/auc.cc +++ b/src/metric/auc.cc @@ -1,5 +1,5 @@ /** - * Copyright 2021-2023 by XGBoost Contributors + * Copyright 2021-2024, XGBoost Contributors */ #include "auc.h" @@ -112,7 +112,9 @@ double MultiClassOVR(Context const *ctx, common::Span predts, MetaI // we have 2 averages going in here, first is among workers, second is among // classes. allreduce sums up fp/tp auc for each class. 
- collective::GlobalSum(info, &results.Values()); + auto rc = collective::GlobalSum(ctx, info, results); + collective::SafeColl(rc); + double auc_sum{0}; double tp_sum{0}; for (size_t c = 0; c < n_classes; ++c) { @@ -286,7 +288,7 @@ class EvalAUC : public MetricNoCache { InvalidGroupAUC(); } - auc = collective::GlobalRatio(info, auc, static_cast(valid_groups)); + auc = collective::GlobalRatio(ctx_, info, auc, static_cast(valid_groups)); if (!std::isnan(auc)) { CHECK_LE(auc, 1) << "Total AUC across groups: " << auc * valid_groups << ", valid groups: " << valid_groups; @@ -307,7 +309,7 @@ class EvalAUC : public MetricNoCache { std::tie(fp, tp, auc) = static_cast(this)->EvalBinary(preds, info); } - auc = collective::GlobalRatio(info, auc, fp * tp); + auc = collective::GlobalRatio(ctx_, info, auc, fp * tp); if (!std::isnan(auc)) { CHECK_LE(auc, 1.0); } diff --git a/src/metric/auc.hip b/src/metric/auc.hip deleted file mode 100644 index a96cbbde5..000000000 --- a/src/metric/auc.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "auc.cu" -#endif diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu index 1886ff12f..3acb24b37 100644 --- a/src/metric/elementwise_metric.cu +++ b/src/metric/elementwise_metric.cu @@ -1,5 +1,5 @@ /** - * Copyright 2015-2023 by XGBoost Contributors + * Copyright 2015-2024, XGBoost Contributors * \file elementwise_metric.cu * \brief evaluation metrics for elementwise binary or regression. * \author Kailong Chen, Tianqi Chen @@ -10,15 +10,16 @@ #include #include +#include // for accumulate -#include "../collective/communicator-inl.h" -#include "../common/common.h" // MetricNoCache +#include "../common/common.h" // for AssertGPUSupport #include "../common/math.h" #include "../common/optional_weight.h" // OptionalWeights #include "../common/pseudo_huber.h" #include "../common/quantile_loss_utils.h" // QuantileLossParam #include "../common/threading_utils.h" -#include "metric_common.h" +#include "metric_common.h" // MetricNoCache +#include "xgboost/collective/result.h" // for SafeColl #include "xgboost/metric.h" #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP) @@ -30,8 +31,7 @@ #include "../common/device_helpers.cuh" #endif // defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP) -namespace xgboost { -namespace metric { +namespace xgboost::metric { // tag the this file, used by force static link later. 
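Editor's note: the metric code now calls collectives that return a result object which must be checked (auto rc = collective::GlobalSum(...); collective::SafeColl(rc);) instead of void functions. The sketch below shows that calling convention with stand-in types only; it is not the actual xgboost::collective API.

#include <stdexcept>
#include <string>
#include <utility>

struct DemoResult {               // stand-in for the collective result type
  bool ok{true};
  std::string msg;
};
inline DemoResult DemoSuccess() { return {}; }
inline DemoResult DemoFail(std::string m) { return {false, std::move(m)}; }

// Mirrors the SafeColl idea: surface a failed collective as an exception right away.
inline void DemoSafeColl(DemoResult const& rc) {
  if (!rc.ok) {
    throw std::runtime_error(rc.msg);
  }
}

// auto rc = SomeAllreduce(...);  // hypothetical call returning DemoResult
// DemoSafeColl(rc);              // throws instead of silently continuing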
DMLC_REGISTRY_FILE_TAG(elementwise_metric); @@ -199,7 +199,8 @@ class PseudoErrorLoss : public MetricNoCache { return std::make_tuple(v, wt); }); std::array dat{result.Residue(), result.Weights()}; - collective::GlobalSum(info, &dat); + auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size())); + collective::SafeColl(rc); return EvalRowMAPE::GetFinal(dat[0], dat[1]); } }; @@ -243,11 +244,11 @@ struct EvalError { }; struct EvalPoissonNegLogLik { - const char *Name() const { + [[nodiscard]] const char *Name() const { return "poisson-nloglik"; } - XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float py) const { + [[nodiscard]] XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float py) const { const bst_float eps = 1e-16f; if (py < eps) py = eps; return common::LogGamma(y + 1.0f) + py - std::log(py) * y; @@ -266,9 +267,9 @@ struct EvalPoissonNegLogLik { * predt >= 0 */ struct EvalGammaDeviance { - const char *Name() const { return "gamma-deviance"; } + [[nodiscard]] const char *Name() const { return "gamma-deviance"; } - XGBOOST_DEVICE bst_float EvalRow(bst_float label, bst_float predt) const { + [[nodiscard]] XGBOOST_DEVICE bst_float EvalRow(bst_float label, bst_float predt) const { predt += kRtEps; label += kRtEps; return std::log(predt / label) + label / predt - 1; @@ -287,7 +288,7 @@ struct EvalGammaNLogLik { return "gamma-nloglik"; } - XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float py) const { + [[nodiscard]] XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float py) const { py = std::max(py, 1e-6f); // hardcoded dispersion. float constexpr kPsi = 1.0; @@ -313,7 +314,7 @@ struct EvalTweedieNLogLik { CHECK(rho_ < 2 && rho_ >= 1) << "tweedie variance power must be in interval [1, 2)"; } - const char *Name() const { + [[nodiscard]] const char *Name() const { static thread_local std::string name; std::ostringstream os; os << "tweedie-nloglik@" << rho_; @@ -321,7 +322,7 @@ struct EvalTweedieNLogLik { return name.c_str(); } - XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float p) const { + [[nodiscard]] XGBOOST_DEVICE bst_float EvalRow(bst_float y, bst_float p) const { bst_float a = y * std::exp((1 - rho_) * std::log(p)) / (1 - rho_); bst_float b = std::exp((2 - rho_) * std::log(p)) / (2 - rho_); return -a + b; @@ -366,7 +367,8 @@ struct EvalEWiseBase : public MetricNoCache { }); std::array dat{result.Residue(), result.Weights()}; - collective::GlobalSum(info, &dat); + auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size())); + collective::SafeColl(rc); return Policy::GetFinal(dat[0], dat[1]); } @@ -438,7 +440,8 @@ class QuantileError : public MetricNoCache { if (info.num_row_ == 0) { // empty DMatrix on distributed env std::array dat{0.0, 0.0}; - collective::GlobalSum(info, &dat); + auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size())); + collective::SafeColl(rc); CHECK_GT(dat[1], 0); return dat[0] / dat[1]; } @@ -476,7 +479,8 @@ class QuantileError : public MetricNoCache { return std::make_tuple(l, w); }); std::array dat{result.Residue(), result.Weights()}; - collective::GlobalSum(info, &dat); + auto rc = collective::GlobalSum(ctx, info, linalg::MakeVec(dat.data(), dat.size())); + collective::SafeColl(rc); CHECK_GT(dat[1], 0); return dat[0] / dat[1]; } @@ -501,5 +505,4 @@ class QuantileError : public MetricNoCache { XGBOOST_REGISTER_METRIC(QuantileError, "quantile") .describe("Quantile regression error.") .set_body([](const char*) { return new QuantileError{}; }); -} // namespace metric -} 
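Editor's note: for reference, the per-row losses implemented by the EvalRow methods touched above are, with label $y$ and prediction $p$ (clamped away from zero as in the code), and the final metric being the weighted mean of these rows:

\[
\ell_{\text{poisson}}(y,p) = \log\Gamma(y+1) + p - y\log p, \qquad
\ell_{\text{gamma-dev}}(y,p) = \log\frac{p}{y} + \frac{y}{p} - 1, \qquad
\ell_{\text{tweedie}}(y,p) = -\frac{y\,p^{1-\rho}}{1-\rho} + \frac{p^{2-\rho}}{2-\rho},\quad \rho \in [1,2).
\]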
// namespace xgboost +} // namespace xgboost::metric diff --git a/src/metric/elementwise_metric.hip b/src/metric/elementwise_metric.hip deleted file mode 100644 index 18e4916a4..000000000 --- a/src/metric/elementwise_metric.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "elementwise_metric.cu" -#endif diff --git a/src/metric/metric_common.h b/src/metric/metric_common.h index 1b148ab0f..2b9239990 100644 --- a/src/metric/metric_common.h +++ b/src/metric/metric_common.h @@ -1,6 +1,5 @@ -/*! - * Copyright 2018-2022 by Contributors - * \file metric_common.h +/** + * Copyright 2018-2024, Contributors */ #ifndef XGBOOST_METRIC_METRIC_COMMON_H_ #define XGBOOST_METRIC_METRIC_COMMON_H_ @@ -10,8 +9,6 @@ #include #include "../collective/aggregator.h" -#include "../collective/communicator-inl.h" -#include "../common/common.h" #include "xgboost/metric.h" namespace xgboost { @@ -24,7 +21,7 @@ class MetricNoCache : public Metric { double Evaluate(HostDeviceVector const &predts, std::shared_ptr p_fmat) final { double result{0.0}; auto const &info = p_fmat->Info(); - collective::ApplyWithLabels(info, &result, sizeof(double), + collective::ApplyWithLabels(ctx_, info, &result, sizeof(double), [&] { result = this->Eval(predts, info); }); return result; } diff --git a/src/metric/multiclass_metric.cu b/src/metric/multiclass_metric.cu index a6d215e6a..95341efea 100644 --- a/src/metric/multiclass_metric.cu +++ b/src/metric/multiclass_metric.cu @@ -1,5 +1,5 @@ /** - * Copyright 2015-2023 by XGBoost Contributors + * Copyright 2015-2024, XGBoost Contributors * \file multiclass_metric.cc * \brief evaluation metrics for multiclass classification. * \author Kailong Chen, Tianqi Chen @@ -9,8 +9,8 @@ #include #include #include +#include // for accumulate -#include "../collective/communicator-inl.h" #include "../common/math.h" #include "../common/threading_utils.h" #include "metric_common.h" // MetricNoCache @@ -24,8 +24,7 @@ #include "../common/device_helpers.cuh" #endif // XGBOOST_USE_CUDA || XGBOOST_USE_HIP -namespace xgboost { -namespace metric { +namespace xgboost::metric { // tag the this file, used by force static link later. DMLC_REGISTRY_FILE_TAG(multiclass_metric); @@ -40,11 +39,10 @@ class MultiClassMetricsReduction { public: MultiClassMetricsReduction() = default; - PackedReduceResult - CpuReduceMetrics(const HostDeviceVector &weights, - const HostDeviceVector &labels, - const HostDeviceVector &preds, - const size_t n_class, int32_t n_threads) const { + [[nodiscard]] PackedReduceResult CpuReduceMetrics(const HostDeviceVector& weights, + const HostDeviceVector& labels, + const HostDeviceVector& preds, + const size_t n_class, int32_t n_threads) const { size_t ndata = labels.Size(); const auto& h_labels = labels.HostVector(); @@ -184,7 +182,8 @@ struct EvalMClassBase : public MetricNoCache { dat[0] = result.Residue(); dat[1] = result.Weights(); } - collective::GlobalSum(info, &dat); + auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size())); + collective::SafeColl(rc); return Derived::GetFinal(dat[0], dat[1]); } /*! 
@@ -247,5 +246,4 @@ XGBOOST_REGISTER_METRIC(MatchError, "merror") XGBOOST_REGISTER_METRIC(MultiLogLoss, "mlogloss") .describe("Multiclass negative loglikelihood.") .set_body([](const char*) { return new EvalMultiLogLoss(); }); -} // namespace metric -} // namespace xgboost +} // namespace xgboost::metric diff --git a/src/metric/multiclass_metric.hip b/src/metric/multiclass_metric.hip deleted file mode 100644 index 4689644c8..000000000 --- a/src/metric/multiclass_metric.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "multiclass_metric.cu" -#endif // defined(XGBOOST_USE_HIP) diff --git a/src/metric/rank_metric.cc b/src/metric/rank_metric.cc index 6762aec32..53841c051 100644 --- a/src/metric/rank_metric.cc +++ b/src/metric/rank_metric.cc @@ -101,7 +101,7 @@ struct EvalAMS : public MetricNoCache { } } - const char* Name() const override { + [[nodiscard]] const char* Name() const override { return name_.c_str(); } @@ -159,7 +159,7 @@ struct EvalRank : public MetricNoCache, public EvalRankConfig { exc.Rethrow(); } - return collective::GlobalRatio(info, sum_metric, static_cast(ngroups)); + return collective::GlobalRatio(ctx_, info, sum_metric, static_cast(ngroups)); } [[nodiscard]] const char* Name() const override { @@ -274,7 +274,7 @@ class EvalRankWithCache : public Metric { double Evaluate(HostDeviceVector const& preds, std::shared_ptr p_fmat) override { double result{0.0}; auto const& info = p_fmat->Info(); - collective::ApplyWithLabels(info, &result, sizeof(double), [&] { + collective::ApplyWithLabels(ctx_, info, &result, sizeof(double), [&] { auto p_cache = cache_.CacheItem(p_fmat, ctx_, info, param_); if (p_cache->Param() != param_) { p_cache = cache_.ResetItem(p_fmat, ctx_, info, param_); @@ -294,9 +294,10 @@ class EvalRankWithCache : public Metric { }; namespace { -double Finalize(Context const*, MetaInfo const& info, double score, double sw) { +double Finalize(Context const* ctx, MetaInfo const& info, double score, double sw) { std::array dat{score, sw}; - collective::GlobalSum(info, &dat); + auto rc = collective::GlobalSum(ctx, info, linalg::MakeVec(dat.data(), 2)); + collective::SafeColl(rc); std::tie(score, sw) = std::tuple_cat(dat); if (sw > 0.0) { score = score / sw; diff --git a/src/metric/rank_metric.cu b/src/metric/rank_metric.cu index eb6f1b3a1..aab0f144b 100644 --- a/src/metric/rank_metric.cu +++ b/src/metric/rank_metric.cu @@ -1,5 +1,5 @@ /** - * Copyright 2020-2023 by XGBoost Contributors + * Copyright 2020-2024, XGBoost Contributors */ #include #include // for make_counting_iterator diff --git a/src/metric/rank_metric.hip b/src/metric/rank_metric.hip deleted file mode 100644 index a8ed8b267..000000000 --- a/src/metric/rank_metric.hip +++ /dev/null @@ -1,5 +0,0 @@ - - -#if defined(XGBOOST_USE_HIP) -#include "rank_metric.cu" -#endif diff --git a/src/metric/survival_metric.cu b/src/metric/survival_metric.cu index 4cf77669b..099df4a02 100644 --- a/src/metric/survival_metric.cu +++ b/src/metric/survival_metric.cu @@ -1,5 +1,5 @@ /** - * Copyright 2019-2023 by Contributors + * Copyright 2019-2024, Contributors * \file survival_metric.cu * \brief Metrics for survival analysis * \author Avinash Barnwal, Hyunsu Cho and Toby Hocking @@ -9,10 +9,9 @@ #include #include +#include // for accumulate #include -#include "../collective/communicator-inl.h" -#include "../common/math.h" #include "../common/survival_util.h" #include "../common/threading_utils.h" #include "metric_common.h" // MetricNoCache @@ -30,8 +29,7 @@ using ProbabilityDistributionType = 
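Editor's note: the ranking metrics finish by summing the local (score, weight) pair across workers and dividing, as in the rewritten Finalize above. A serial stand-in showing the shape of that reduction; the allreduce is replaced by a comment.

#include <array>

// Sum the local (score, weight) pair, then divide by the total weight when positive.
double FinalizeSketch(double score, double sum_weight) {
  std::array<double, 2> dat{score, sum_weight};
  // collective::GlobalSum would sum both entries across workers here.
  if (dat[1] > 0.0) {
    dat[0] /= dat[1];
  }
  return dat[0];
}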
xgboost::common::ProbabilityDistributionType template using AFTLoss = xgboost::common::AFTLoss; -namespace xgboost { -namespace metric { +namespace xgboost::metric { // tag the this file, used by force static link later. DMLC_REGISTRY_FILE_TAG(survival_metric); @@ -43,12 +41,11 @@ class ElementWiseSurvivalMetricsReduction { policy_ = policy; } - PackedReduceResult - CpuReduceMetrics(const HostDeviceVector &weights, - const HostDeviceVector &labels_lower_bound, - const HostDeviceVector &labels_upper_bound, - const HostDeviceVector &preds, - int32_t n_threads) const { + [[nodiscard]] PackedReduceResult CpuReduceMetrics( + const HostDeviceVector& weights, + const HostDeviceVector& labels_lower_bound, + const HostDeviceVector& labels_upper_bound, + const HostDeviceVector& preds, int32_t n_threads) const { size_t ndata = labels_lower_bound.Size(); CHECK_EQ(ndata, labels_upper_bound.Size()); @@ -156,7 +153,7 @@ class ElementWiseSurvivalMetricsReduction { struct EvalIntervalRegressionAccuracy { void Configure(const Args&) {} - const char* Name() const { + [[nodiscard]] const char* Name() const { return "interval-regression-accuracy"; } @@ -178,7 +175,7 @@ struct EvalAFTNLogLik { param_.UpdateAllowUnknown(args); } - const char* Name() const { + [[nodiscard]] const char* Name() const { return "aft-nloglik"; } @@ -214,7 +211,8 @@ struct EvalEWiseSurvivalBase : public MetricNoCache { info.labels_upper_bound_, preds); std::array dat{result.Residue(), result.Weights()}; - collective::GlobalSum(info, &dat); + auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size())); + collective::SafeColl(rc); return Policy::GetFinal(dat[0], dat[1]); } @@ -231,7 +229,7 @@ struct EvalEWiseSurvivalBase : public MetricNoCache { // This class exists because we want to perform dispatch according to the distribution type at // configuration time, not at prediction time. 
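Editor's note: EvalIntervalRegressionAccuracy scores survival predictions against interval-censored labels. The sketch below is a plain illustration of the underlying idea only, under the assumption that a row counts as accurate when the prediction falls inside its [lower, upper] label interval; the real EvalRow also handles the AFT link/transformation, which is omitted here, and the helper name is hypothetical.

#include <cstddef>
#include <vector>

// Fraction of rows whose prediction lands inside the censoring interval.
double IntervalAccuracySketch(std::vector<double> const& pred,
                              std::vector<double> const& lower,
                              std::vector<double> const& upper) {
  std::size_t hit = 0;
  for (std::size_t i = 0; i < pred.size(); ++i) {
    if (pred[i] >= lower[i] && pred[i] <= upper[i]) {
      ++hit;
    }
  }
  return pred.empty() ? 0.0 : static_cast<double>(hit) / static_cast<double>(pred.size());
}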
struct AFTNLogLikDispatcher : public MetricNoCache { - const char* Name() const override { + [[nodiscard]] const char* Name() const override { return "aft-nloglik"; } @@ -283,5 +281,4 @@ XGBOOST_REGISTER_METRIC(IntervalRegressionAccuracy, "interval-regression-accurac return new EvalEWiseSurvivalBase(); }); -} // namespace metric -} // namespace xgboost +} // namespace xgboost::metric diff --git a/src/metric/survival_metric.hip b/src/metric/survival_metric.hip deleted file mode 100644 index 84a7d1ec2..000000000 --- a/src/metric/survival_metric.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "survival_metric.cu" -#endif diff --git a/src/objective/adaptive.cc b/src/objective/adaptive.cc index 53676a4b8..2aef51824 100644 --- a/src/objective/adaptive.cc +++ b/src/objective/adaptive.cc @@ -1,5 +1,5 @@ /** - * Copyright 2022-2023 by XGBoost Contributors + * Copyright 2022-2024, XGBoost Contributors */ #include "adaptive.h" @@ -85,7 +85,7 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector const& posit size_t n_leaf = nidx.size(); if (nptr.empty()) { std::vector quantiles; - UpdateLeafValues(&quantiles, nidx, info, learning_rate, p_tree); + UpdateLeafValues(ctx, &quantiles, nidx, info, learning_rate, p_tree); return; } @@ -100,7 +100,7 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector const& posit predt.Size() / info.num_row_); collective::ApplyWithLabels( - info, static_cast(quantiles.data()), quantiles.size() * sizeof(float), [&] { + ctx, info, static_cast(quantiles.data()), quantiles.size() * sizeof(float), [&] { // loop over each leaf common::ParallelFor(quantiles.size(), ctx->Threads(), [&](size_t k) { auto nidx = h_node_idx[k]; @@ -134,7 +134,7 @@ void UpdateTreeLeafHost(Context const* ctx, std::vector const& posit }); }); - UpdateLeafValues(&quantiles, nidx, info, learning_rate, p_tree); + UpdateLeafValues(ctx, &quantiles, nidx, info, learning_rate, p_tree); } #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP) diff --git a/src/objective/adaptive.cu b/src/objective/adaptive.cu index d5c7bfda9..5a0b17210 100644 --- a/src/objective/adaptive.cu +++ b/src/objective/adaptive.cu @@ -1,5 +1,5 @@ /** - * Copyright 2022-2023 by XGBoost Contributors + * Copyright 2022-2024, XGBoost Contributors */ #include @@ -157,7 +157,7 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span pos if (nptr.Empty()) { std::vector quantiles; - UpdateLeafValues(&quantiles, nidx.ConstHostVector(), info, learning_rate, p_tree); + UpdateLeafValues(ctx, &quantiles, nidx.ConstHostVector(), info, learning_rate, p_tree); } predt.SetDevice(ctx->Device()); @@ -167,7 +167,7 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span pos auto t_predt = d_predt.Slice(linalg::All(), group_idx); HostDeviceVector quantiles; - collective::ApplyWithLabels(info, &quantiles, [&] { + collective::ApplyWithLabels(ctx, info, &quantiles, [&] { auto d_labels = info.labels.View(ctx->Device()).Slice(linalg::All(), IdxY(info, group_idx)); auto d_row_index = dh::ToSpan(ridx); auto seg_beg = nptr.DevicePointer(); @@ -193,6 +193,7 @@ void UpdateTreeLeafDevice(Context const* ctx, common::Span pos w_it + d_weights.size(), &quantiles); } }); - UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), info, learning_rate, p_tree); + UpdateLeafValues(ctx, &quantiles.HostVector(), nidx.ConstHostVector(), info, learning_rate, + p_tree); } } // namespace xgboost::obj::detail diff --git a/src/objective/adaptive.h b/src/objective/adaptive.h index a64f37f63..cbe69e79a 100644 --- 
a/src/objective/adaptive.h +++ b/src/objective/adaptive.h @@ -1,5 +1,5 @@ /** - * Copyright 2022-2023 by XGBoost Contributors + * Copyright 2022-2024, XGBoost Contributors */ #pragma once @@ -17,8 +17,7 @@ #include "xgboost/host_device_vector.h" // HostDeviceVector #include "xgboost/tree_model.h" // RegTree -namespace xgboost { -namespace obj { +namespace xgboost::obj { namespace detail { inline void FillMissingLeaf(std::vector const& maybe_missing, std::vector* p_nidx, std::vector* p_nptr) { @@ -36,13 +35,14 @@ inline void FillMissingLeaf(std::vector const& maybe_missing, } } -inline void UpdateLeafValues(std::vector* p_quantiles, std::vector const& nidx, - MetaInfo const& info, float learning_rate, RegTree* p_tree) { +inline void UpdateLeafValues(Context const* ctx, std::vector* p_quantiles, + std::vector const& nidx, MetaInfo const& info, + float learning_rate, RegTree* p_tree) { auto& tree = *p_tree; auto& quantiles = *p_quantiles; auto const& h_node_idx = nidx; - size_t n_leaf = collective::GlobalMax(info, h_node_idx.size()); + size_t n_leaf = collective::GlobalMax(ctx, info, h_node_idx.size()); CHECK(quantiles.empty() || quantiles.size() == n_leaf); if (quantiles.empty()) { quantiles.resize(n_leaf, std::numeric_limits::quiet_NaN()); @@ -52,12 +52,16 @@ inline void UpdateLeafValues(std::vector* p_quantiles, std::vector n_valids(quantiles.size()); std::transform(quantiles.cbegin(), quantiles.cend(), n_valids.begin(), [](float q) { return static_cast(!std::isnan(q)); }); - collective::GlobalSum(info, &n_valids); + auto rc = collective::GlobalSum(ctx, info, linalg::MakeVec(n_valids.data(), n_valids.size())); + collective::SafeColl(rc); + // convert to 0 for all reduce std::replace_if( quantiles.begin(), quantiles.end(), [](float q) { return std::isnan(q); }, 0.f); // use the mean value - collective::GlobalSum(info, &quantiles); + rc = collective::GlobalSum(ctx, info, linalg::MakeVec(quantiles.data(), quantiles.size())); + collective::SafeColl(rc); + for (size_t i = 0; i < n_leaf; ++i) { if (n_valids[i] > 0) { quantiles[i] /= static_cast(n_valids[i]); @@ -105,5 +109,4 @@ inline void UpdateTreeLeaf(Context const* ctx, HostDeviceVector cons predt, alpha, p_tree); } } -} // namespace obj -} // namespace xgboost +} // namespace xgboost::obj diff --git a/src/objective/adaptive.hip b/src/objective/adaptive.hip deleted file mode 100644 index 7558ac176..000000000 --- a/src/objective/adaptive.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "adaptive.cu" -#endif diff --git a/src/objective/aft_obj.hip b/src/objective/aft_obj.hip deleted file mode 100644 index 24d5bbc15..000000000 --- a/src/objective/aft_obj.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "aft_obj.cu" -#endif diff --git a/src/objective/hinge.hip b/src/objective/hinge.hip deleted file mode 100644 index 08d3541b6..000000000 --- a/src/objective/hinge.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "hinge.cu" -#endif diff --git a/src/objective/lambdarank_obj.cc b/src/objective/lambdarank_obj.cc index efddf636e..e39134ea4 100644 --- a/src/objective/lambdarank_obj.cc +++ b/src/objective/lambdarank_obj.cc @@ -222,7 +222,7 @@ class LambdaRankObj : public FitIntercept { }; MakePairs(ctx_, iter, p_cache_, g, g_label, g_rank, loop); - if (sum_lambda > 0.0) { + if (sum_lambda > 0.0 && param_.lambdarank_normalization) { double norm = std::log2(1.0 + sum_lambda) / sum_lambda; std::transform(g_gpair.Values().data(), g_gpair.Values().data() + g_gpair.Size(), 
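Editor's note: UpdateLeafValues above averages the per-leaf quantiles across workers; leaves a worker never saw are NaN, so the code counts valid entries, zeroes the NaNs, sums both arrays globally, and divides. A single-process sketch of that merging step, with the two allreduces reduced to element-wise addition of a second worker's arrays; the function name is a stand-in and both inputs are assumed to have equal length.

#include <cmath>
#include <cstddef>
#include <initializer_list>
#include <vector>

// Merge per-leaf quantiles from two workers; NaN marks "no sample in this leaf on this worker".
std::vector<float> MergeLeafQuantiles(std::vector<float> const& a, std::vector<float> const& b) {
  std::vector<float> out(a.size(), 0.0f);
  std::vector<int> n_valid(a.size(), 0);
  for (std::size_t i = 0; i < a.size(); ++i) {
    for (float q : {a[i], b[i]}) {
      if (!std::isnan(q)) {  // treat NaN as missing, exactly like the replace_if above
        out[i] += q;
        ++n_valid[i];
      }
    }
    if (n_valid[i] > 0) {
      out[i] /= static_cast<float>(n_valid[i]);  // mean over workers that saw the leaf
    }
  }
  return out;
}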
g_gpair.Values().data(), [norm](GradientPair const& g) { return g * norm; }); @@ -474,7 +474,6 @@ class LambdaRankMAP : public LambdaRankObj { public: void GetGradientImpl(std::int32_t iter, const HostDeviceVector& predt, const MetaInfo& info, linalg::Matrix* out_gpair) { - CHECK(param_.ndcg_exp_gain) << "NDCG gain can not be set for the MAP objective."; if (ctx_->IsCUDA()) { return cuda_impl::LambdaRankGetGradientMAP( ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->Device()), @@ -564,7 +563,6 @@ class LambdaRankPairwise : public LambdaRankObj& predt, const MetaInfo& info, linalg::Matrix* out_gpair) { - CHECK(param_.ndcg_exp_gain) << "NDCG gain can not be set for the pairwise objective."; if (ctx_->IsCUDA()) { return cuda_impl::LambdaRankGetGradientPairwise( ctx_, iter, predt, info, GetCache(), ti_plus_.View(ctx_->Device()), @@ -610,6 +608,13 @@ class LambdaRankPairwise : public LambdaRankObjRankEvalMetric("ndcg"); } + + [[nodiscard]] Json DefaultMetricConfig() const override { + Json config{Object{}}; + config["name"] = String{DefaultEvalMetric()}; + config["lambdarank_param"] = ToJson(param_); + return config; + } }; #if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_HIP) diff --git a/src/objective/lambdarank_obj.cu b/src/objective/lambdarank_obj.cu index 47d7957e8..e6f5be64c 100644 --- a/src/objective/lambdarank_obj.cu +++ b/src/objective/lambdarank_obj.cu @@ -270,12 +270,13 @@ void CalcGrad(Context const* ctx, MetaInfo const& info, std::shared_ptrWeightNorm(); + auto norm = p_cache->Param().lambdarank_normalization; thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul), d_gpair.Size(), [=] XGBOOST_DEVICE(std::size_t i) mutable { auto g = dh::SegmentId(d_gptr, i); auto sum_lambda = thrust::get<2>(d_max_lambdas[g]); // Normalization - if (sum_lambda > 0.0) { + if (sum_lambda > 0.0 && norm) { double norm = std::log2(1.0 + sum_lambda) / sum_lambda; d_gpair(i, 0) *= norm; } diff --git a/src/objective/lambdarank_obj.hip b/src/objective/lambdarank_obj.hip deleted file mode 100644 index a99255fdd..000000000 --- a/src/objective/lambdarank_obj.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "lambdarank_obj.cu" -#endif diff --git a/src/objective/multiclass_obj.hip b/src/objective/multiclass_obj.hip deleted file mode 100644 index 914398d38..000000000 --- a/src/objective/multiclass_obj.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "multiclass_obj.cu" -#endif diff --git a/src/objective/quantile_obj.cu b/src/objective/quantile_obj.cu index 444151295..f4713fbc8 100644 --- a/src/objective/quantile_obj.cu +++ b/src/objective/quantile_obj.cu @@ -1,5 +1,5 @@ /** - * Copyright 2023 by XGBoost contributors + * Copyright 2023-2024, XGBoost contributors */ #include // std::array #include // std::size_t @@ -170,7 +170,9 @@ class QuantileRegression : public ObjFunction { double meanq = temp(0) * sw; std::array dat{meanq, sw}; - collective::GlobalSum(info, &dat); + auto rc = collective::GlobalSum(ctx_, info, linalg::MakeVec(dat.data(), dat.size())); + collective::SafeColl(rc); + std::tie(meanq, sw) = std::tuple_cat(dat); meanq /= (sw + kRtEps); base_score->Reshape(1); diff --git a/src/objective/quantile_obj.hip b/src/objective/quantile_obj.hip deleted file mode 100644 index e755a5515..000000000 --- a/src/objective/quantile_obj.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "quantile_obj.cu" -#endif diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu index 
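Editor's note: with the new lambdarank_normalization switch enabled (the default), both the CPU and CUDA paths above rescale a query group's gradient pairs by the same factor. Letting $S$ denote the group's accumulated sum_lambda, each gradient pair is scaled as

\[
g_i \leftarrow g_i \cdot \frac{\log_2(1 + S)}{S}, \qquad S > 0,
\]

which matches norm = std::log2(1.0 + sum_lambda) / sum_lambda in both kernels; when the switch is off or $S = 0$, the gradients are left untouched.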
20e04ac59..fdb06474e 100644 --- a/src/objective/regression_obj.cu +++ b/src/objective/regression_obj.cu @@ -1,5 +1,5 @@ /** - * Copyright 2015-2023 by XGBoost Contributors + * Copyright 2015-2024, XGBoost Contributors * \file regression_obj.cu * \brief Definition of single-value regression and classification objectives. * \author Tianqi Chen, Kailong Chen @@ -672,8 +672,12 @@ class MeanAbsoluteError : public ObjFunction { std::transform(linalg::cbegin(out), linalg::cend(out), linalg::begin(out), [w](float v) { return v * w; }); - collective::GlobalSum(info, &out.Values()); - collective::GlobalSum(info, &w, 1); + auto rc = collective::Success() << [&] { + return collective::GlobalSum(ctx_, info, out); + } << [&] { + return collective::GlobalSum(ctx_, info, linalg::MakeVec(&w, 1)); + }; + collective::SafeColl(rc); if (common::CloseTo(w, 0.0)) { // Mostly for handling empty dataset test. diff --git a/src/objective/regression_obj.hip b/src/objective/regression_obj.hip deleted file mode 100644 index 1812685af..000000000 --- a/src/objective/regression_obj.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "regression_obj.cu" -#endif diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc index d97b527f0..f253493fc 100644 --- a/src/predictor/cpu_predictor.cc +++ b/src/predictor/cpu_predictor.cc @@ -184,7 +184,7 @@ void FVecDrop(std::size_t const block_size, std::size_t const fvec_offset, static std::size_t constexpr kUnroll = 8; struct SparsePageView { - bst_row_t base_rowid; + bst_idx_t base_rowid; HostSparsePageView view; explicit SparsePageView(SparsePage const *p) : base_rowid{p->base_rowid} { view = p->GetView(); } @@ -193,7 +193,7 @@ struct SparsePageView { }; struct SingleInstanceView { - bst_row_t base_rowid{}; + bst_idx_t base_rowid{}; SparsePage::Inst const &inst; explicit SingleInstanceView(SparsePage::Inst const &instance) : inst{instance} {} @@ -214,7 +214,7 @@ struct GHistIndexMatrixView { std::vector const& values_; public: - size_t base_rowid; + bst_idx_t base_rowid; public: GHistIndexMatrixView(GHistIndexMatrix const &_page, uint64_t n_feat, @@ -292,7 +292,7 @@ class AdapterView { [[nodiscard]] size_t Size() const { return adapter_->NumRows(); } - bst_row_t const static base_rowid = 0; // NOLINT + bst_idx_t const static base_rowid = 0; // NOLINT }; template @@ -698,6 +698,67 @@ class CPUPredictor : public Predictor { } } + template + void PredictContributionKernel(DataView batch, const MetaInfo& info, + const gbm::GBTreeModel& model, + const std::vector* tree_weights, + std::vector>* mean_values, + std::vector* feat_vecs, + std::vector* contribs, uint32_t ntree_limit, + bool approximate, int condition, + unsigned condition_feature) const { + const int num_feature = model.learner_model_param->num_feature; + const int ngroup = model.learner_model_param->num_output_group; + CHECK_NE(ngroup, 0); + size_t const ncolumns = num_feature + 1; + CHECK_NE(ncolumns, 0); + auto base_margin = info.base_margin_.View(ctx_->Device()); + auto base_score = model.learner_model_param->BaseScore(ctx_->Device())(0); + + // parallel over local batch + common::ParallelFor(batch.Size(), this->ctx_->Threads(), [&](auto i) { + auto row_idx = batch.base_rowid + i; + RegTree::FVec &feats = (*feat_vecs)[omp_get_thread_num()]; + if (feats.Size() == 0) { + feats.Init(num_feature); + } + std::vector this_tree_contribs(ncolumns); + // loop over all classes + for (int gid = 0; gid < ngroup; ++gid) { + bst_float* p_contribs = &(*contribs)[(row_idx * ngroup + gid) * 
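Editor's note: MeanAbsoluteError now composes the two allreduces with collective::Success() << [&]{...} << [&]{...}, meaning each lambda runs only if the previous step succeeded and the first failure is carried forward. A stand-alone sketch of that short-circuiting operator with stand-in types, not the real collective::Result.

#include <string>

struct ChainResult {  // stand-in for the collective result type
  bool ok{true};
  std::string msg;
};
inline ChainResult ChainSuccess() { return {}; }

// Run the next step only when everything so far succeeded; otherwise keep the first error.
template <typename Fn>
ChainResult operator<<(ChainResult prev, Fn&& next) {
  if (!prev.ok) {
    return prev;
  }
  return next();
}

// auto rc = ChainSuccess() << [&] { return AllreduceValues(); }   // each lambda returns ChainResult
//                          << [&] { return AllreduceWeight(); };  // hypothetical steps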
ncolumns]; + feats.Fill(batch[i]); + // calculate contributions + for (unsigned j = 0; j < ntree_limit; ++j) { + auto *tree_mean_values = &mean_values->at(j); + std::fill(this_tree_contribs.begin(), this_tree_contribs.end(), 0); + if (model.tree_info[j] != gid) { + continue; + } + if (!approximate) { + CalculateContributions(*model.trees[j], feats, tree_mean_values, + &this_tree_contribs[0], condition, condition_feature); + } else { + model.trees[j]->CalculateContributionsApprox( + feats, tree_mean_values, &this_tree_contribs[0]); + } + for (size_t ci = 0; ci < ncolumns; ++ci) { + p_contribs[ci] += + this_tree_contribs[ci] * + (tree_weights == nullptr ? 1 : (*tree_weights)[j]); + } + } + feats.Drop(); + // add base margin to BIAS + if (base_margin.Size() != 0) { + CHECK_EQ(base_margin.Shape(1), ngroup); + p_contribs[ncolumns - 1] += base_margin(row_idx, gid); + } else { + p_contribs[ncolumns - 1] += base_score; + } + } + }); + } + public: explicit CPUPredictor(Context const *ctx) : Predictor::Predictor{ctx} {} @@ -861,7 +922,6 @@ class CPUPredictor : public Predictor { CHECK(!p_fmat->Info().IsColumnSplit()) << "Predict contribution support for column-wise data split is not yet implemented."; auto const n_threads = this->ctx_->Threads(); - const int num_feature = model.learner_model_param->num_feature; std::vector feat_vecs; InitThreadTemp(n_threads, &feat_vecs); const MetaInfo& info = p_fmat->Info(); @@ -869,10 +929,7 @@ class CPUPredictor : public Predictor { if (ntree_limit == 0 || ntree_limit > model.trees.size()) { ntree_limit = static_cast(model.trees.size()); } - const int ngroup = model.learner_model_param->num_output_group; - CHECK_NE(ngroup, 0); - size_t const ncolumns = num_feature + 1; - CHECK_NE(ncolumns, 0); + size_t const ncolumns = model.learner_model_param->num_feature + 1; // allocate space for (number of features + bias) times the number of rows std::vector& contribs = out_contribs->HostVector(); contribs.resize(info.num_row_ * ncolumns * model.learner_model_param->num_output_group); @@ -884,53 +941,22 @@ class CPUPredictor : public Predictor { common::ParallelFor(ntree_limit, n_threads, [&](bst_omp_uint i) { FillNodeMeanValues(model.trees[i].get(), &(mean_values[i])); }); - auto base_margin = info.base_margin_.View(ctx_->Device()); - auto base_score = model.learner_model_param->BaseScore(ctx_->Device())(0); // start collecting the contributions - for (const auto &batch : p_fmat->GetBatches()) { - auto page = batch.GetView(); - // parallel over local batch - common::ParallelFor(batch.Size(), n_threads, [&](auto i) { - auto row_idx = batch.base_rowid + i; - RegTree::FVec &feats = feat_vecs[omp_get_thread_num()]; - if (feats.Size() == 0) { - feats.Init(num_feature); - } - std::vector this_tree_contribs(ncolumns); - // loop over all classes - for (int gid = 0; gid < ngroup; ++gid) { - bst_float* p_contribs = &contribs[(row_idx * ngroup + gid) * ncolumns]; - feats.Fill(page[i]); - // calculate contributions - for (unsigned j = 0; j < ntree_limit; ++j) { - auto *tree_mean_values = &mean_values.at(j); - std::fill(this_tree_contribs.begin(), this_tree_contribs.end(), 0); - if (model.tree_info[j] != gid) { - continue; - } - if (!approximate) { - CalculateContributions(*model.trees[j], feats, tree_mean_values, - &this_tree_contribs[0], condition, condition_feature); - } else { - model.trees[j]->CalculateContributionsApprox( - feats, tree_mean_values, &this_tree_contribs[0]); - } - for (size_t ci = 0; ci < ncolumns; ++ci) { - p_contribs[ci] += - this_tree_contribs[ci] * - 
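Editor's note: in the extracted PredictContributionKernel, the contribution vector for row $i$ and class $g$ is accumulated over the trees assigned to that class, and the trailing bias column absorbs the base margin or base score:

\[
\phi_{i,g,c} = \sum_{j \,:\, \text{tree}_j \in \text{group } g} w_j\,\phi^{(j)}_{i,c},
\qquad
\phi_{i,g,\text{bias}} \mathrel{+}= 
\begin{cases}
  \text{base\_margin}_{i,g} & \text{if a base margin is supplied,}\\
  \text{base\_score} & \text{otherwise,}
\end{cases}
\]

where $w_j$ is the optional per-tree weight (1 when tree_weights == nullptr) and the last of the num_feature + 1 columns is the bias term.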
(tree_weights == nullptr ? 1 : (*tree_weights)[j]); - } - } - feats.Drop(); - // add base margin to BIAS - if (base_margin.Size() != 0) { - CHECK_EQ(base_margin.Shape(1), ngroup); - p_contribs[ncolumns - 1] += base_margin(row_idx, gid); - } else { - p_contribs[ncolumns - 1] += base_score; - } - } - }); + if (!p_fmat->PageExists()) { + std::vector workspace(info.num_col_ * kUnroll * n_threads); + auto ft = p_fmat->Info().feature_types.ConstHostVector(); + for (const auto &batch : p_fmat->GetBatches(ctx_, {})) { + PredictContributionKernel( + GHistIndexMatrixView{batch, info.num_col_, ft, workspace, n_threads}, + info, model, tree_weights, &mean_values, &feat_vecs, &contribs, ntree_limit, + approximate, condition, condition_feature); + } + } else { + for (const auto &batch : p_fmat->GetBatches()) { + PredictContributionKernel( + SparsePageView{&batch}, info, model, tree_weights, &mean_values, &feat_vecs, + &contribs, ntree_limit, approximate, condition, condition_feature); + } } } diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 1aa1cfa1a..69d5b4bf5 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -67,12 +67,12 @@ struct TreeView { struct SparsePageView { common::Span d_data; - common::Span d_row_ptr; + common::Span d_row_ptr; bst_feature_t num_features; SparsePageView() = default; XGBOOST_DEVICE SparsePageView(common::Span data, - common::Span row_ptr, + common::Span row_ptr, bst_feature_t num_features) : d_data{data}, d_row_ptr{row_ptr}, num_features(num_features) {} [[nodiscard]] __device__ float GetElement(size_t ridx, size_t fidx) const { @@ -113,7 +113,7 @@ struct SparsePageLoader { float* smem; __device__ SparsePageLoader(SparsePageView data, bool use_shared, bst_feature_t num_features, - bst_row_t num_rows, size_t entry_start, float) + bst_idx_t num_rows, size_t entry_start, float) : use_shared(use_shared), data(data) { extern __shared__ float _smem[]; @@ -146,7 +146,7 @@ struct SparsePageLoader { struct EllpackLoader { EllpackDeviceAccessor const& matrix; - XGBOOST_DEVICE EllpackLoader(EllpackDeviceAccessor const& m, bool, bst_feature_t, bst_row_t, + XGBOOST_DEVICE EllpackLoader(EllpackDeviceAccessor const& m, bool, bst_feature_t, bst_idx_t, size_t, float) : matrix{m} {} [[nodiscard]] __device__ __forceinline__ float GetElement(size_t ridx, size_t fidx) const { @@ -177,7 +177,7 @@ struct DeviceAdapterLoader { using BatchT = Batch; XGBOOST_DEV_INLINE DeviceAdapterLoader(Batch const batch, bool use_shared, - bst_feature_t num_features, bst_row_t num_rows, + bst_feature_t num_features, bst_idx_t num_rows, size_t entry_start, float missing) : batch{batch}, columns{num_features}, use_shared{use_shared}, is_valid{missing} { extern __shared__ float _smem[]; @@ -215,7 +215,7 @@ struct DeviceAdapterLoader { }; template -__device__ bst_node_t GetLeafIndex(bst_row_t ridx, TreeView const &tree, +__device__ bst_node_t GetLeafIndex(bst_idx_t ridx, TreeView const &tree, Loader *loader) { bst_node_t nidx = 0; RegTree::Node n = tree.d_tree[nidx]; @@ -230,7 +230,7 @@ __device__ bst_node_t GetLeafIndex(bst_row_t ridx, TreeView const &tree, } template -__device__ float GetLeafWeight(bst_row_t ridx, TreeView const &tree, +__device__ float GetLeafWeight(bst_idx_t ridx, TreeView const &tree, Loader *loader) { bst_node_t nidx = -1; if (tree.HasCategoricalSplit()) { @@ -255,7 +255,7 @@ PredictLeafKernel(Data data, common::Span d_nodes, size_t tree_begin, size_t tree_end, size_t num_features, size_t num_rows, size_t entry_start, bool 
use_shared, float missing) { - bst_row_t ridx = blockDim.x * blockIdx.x + threadIdx.x; + bst_idx_t ridx = blockDim.x * blockIdx.x + threadIdx.x; if (ridx >= num_rows) { return; } @@ -670,7 +670,7 @@ __global__ void MaskBitVectorKernel( } } -__device__ bst_node_t GetLeafIndexByBitVector(bst_row_t ridx, TreeView const& tree, +__device__ bst_node_t GetLeafIndexByBitVector(bst_idx_t ridx, TreeView const& tree, BitVector const& decision_bits, BitVector const& missing_bits, std::size_t num_nodes, std::size_t tree_offset) { @@ -688,7 +688,7 @@ __device__ bst_node_t GetLeafIndexByBitVector(bst_row_t ridx, TreeView const& tr return nidx; } -__device__ float GetLeafWeightByBitVector(bst_row_t ridx, TreeView const& tree, +__device__ float GetLeafWeightByBitVector(bst_idx_t ridx, TreeView const& tree, BitVector const& decision_bits, BitVector const& missing_bits, std::size_t num_nodes, std::size_t tree_offset) { @@ -1048,6 +1048,9 @@ class GPUPredictor : public xgboost::Predictor { if (tree_weights != nullptr) { LOG(FATAL) << "Dart booster feature " << not_implemented; } + if (!p_fmat->PageExists()) { + LOG(FATAL) << "SHAP value for QuantileDMatrix is not yet implemented for GPU."; + } CHECK(!p_fmat->Info().IsColumnSplit()) << "Predict contribution support for column-wise data split is not yet implemented."; dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); @@ -1108,6 +1111,9 @@ class GPUPredictor : public xgboost::Predictor { if (tree_weights != nullptr) { LOG(FATAL) << "Dart booster feature " << not_implemented; } + if (!p_fmat->PageExists()) { + LOG(FATAL) << "SHAP value for QuantileDMatrix is not yet implemented for GPU."; + } dh::safe_cuda(cudaSetDevice(ctx_->Ordinal())); out_contribs->SetDevice(ctx_->Device()); if (tree_end == 0 || tree_end > model.trees.size()) { @@ -1171,7 +1177,7 @@ class GPUPredictor : public xgboost::Predictor { auto max_shared_memory_bytes = ConfigureDevice(ctx_->Device()); const MetaInfo& info = p_fmat->Info(); - bst_row_t num_rows = info.num_row_; + bst_idx_t num_rows = info.num_row_; if (tree_end == 0 || tree_end > model.trees.size()) { tree_end = static_cast(model.trees.size()); } @@ -1196,7 +1202,7 @@ class GPUPredictor : public xgboost::Predictor { for (auto const& batch : p_fmat->GetBatches()) { batch.data.SetDevice(ctx_->Device()); batch.offset.SetDevice(ctx_->Device()); - bst_row_t batch_offset = 0; + bst_idx_t batch_offset = 0; SparsePageView data{batch.data.DeviceSpan(), batch.offset.DeviceSpan(), model.learner_model_param->num_feature}; size_t num_rows = batch.Size(); @@ -1219,7 +1225,7 @@ class GPUPredictor : public xgboost::Predictor { } } else { for (auto const& batch : p_fmat->GetBatches(ctx_, BatchParam{})) { - bst_row_t batch_offset = 0; + bst_idx_t batch_offset = 0; EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->Device())}; size_t num_rows = batch.Size(); auto grid = diff --git a/src/predictor/gpu_predictor.hip b/src/predictor/gpu_predictor.hip deleted file mode 100644 index 33760f6dd..000000000 --- a/src/predictor/gpu_predictor.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "gpu_predictor.cu" -#endif diff --git a/src/predictor/predictor.cc b/src/predictor/predictor.cc index 8d7e9699a..5c4b3a0d8 100644 --- a/src/predictor/predictor.cc +++ b/src/predictor/predictor.cc @@ -9,7 +9,7 @@ #include // for string, to_string #include "../gbm/gbtree_model.h" // for GBTreeModel -#include "xgboost/base.h" // for bst_float, Args, bst_group_t, bst_row_t +#include "xgboost/base.h" // for bst_float, Args, bst_group_t, 
bst_idx_t #include "xgboost/context.h" // for Context #include "xgboost/data.h" // for MetaInfo #include "xgboost/host_device_vector.h" // for HostDeviceVector @@ -34,7 +34,7 @@ Predictor* Predictor::Create(std::string const& name, Context const* ctx) { } template -void ValidateBaseMarginShape(linalg::Tensor const& margin, bst_row_t n_samples, +void ValidateBaseMarginShape(linalg::Tensor const& margin, bst_idx_t n_samples, bst_group_t n_groups) { // FIXME: Bindings other than Python doesn't have shape. std::string expected{"Invalid shape of base_margin. Expected: (" + std::to_string(n_samples) + diff --git a/src/tree/common_row_partitioner.h b/src/tree/common_row_partitioner.h index 4360c0b13..293e7d1d4 100644 --- a/src/tree/common_row_partitioner.h +++ b/src/tree/common_row_partitioner.h @@ -28,7 +28,7 @@ class ColumnSplitHelper { public: ColumnSplitHelper() = default; - ColumnSplitHelper(bst_row_t num_row, + ColumnSplitHelper(bst_idx_t num_row, common::PartitionBuilder* partition_builder, common::RowSetCollection* row_set_collection) : partition_builder_{partition_builder}, row_set_collection_{row_set_collection} { @@ -85,10 +85,10 @@ class ColumnSplitHelper { class CommonRowPartitioner { public: - bst_row_t base_rowid = 0; + bst_idx_t base_rowid = 0; CommonRowPartitioner() = default; - CommonRowPartitioner(Context const* ctx, bst_row_t num_row, bst_row_t _base_rowid, + CommonRowPartitioner(Context const* ctx, bst_idx_t num_row, bst_idx_t _base_rowid, bool is_col_split) : base_rowid{_base_rowid}, is_col_split_{is_col_split} { row_set_collection_.Clear(); diff --git a/src/tree/constraints.hip b/src/tree/constraints.hip deleted file mode 100644 index b8d6208cf..000000000 --- a/src/tree/constraints.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "constraints.cu" -#endif diff --git a/src/tree/fit_stump.cc b/src/tree/fit_stump.cc index 6bafa77d0..edd56fb2d 100644 --- a/src/tree/fit_stump.cc +++ b/src/tree/fit_stump.cc @@ -1,7 +1,7 @@ /** - * Copyright 2022 by XGBoost Contributors + * Copyright 2022-2024, XGBoost Contributors * - * \brief Utilities for estimating initial score. + * @brief Utilities for estimating initial score. */ #include "fit_stump.h" @@ -44,8 +44,11 @@ void FitStump(Context const* ctx, MetaInfo const& info, } } CHECK(h_sum.CContiguous()); - - collective::GlobalSum(info, reinterpret_cast(h_sum.Values().data()), h_sum.Size() * 2); + auto as_double = linalg::MakeTensorView( + ctx, common::Span{reinterpret_cast(h_sum.Values().data()), h_sum.Size() * 2}, + h_sum.Size() * 2); + auto rc = collective::GlobalSum(ctx, info, as_double); + collective::SafeColl(rc); for (std::size_t i = 0; i < h_sum.Size(); ++i) { out(i) = static_cast(CalcUnregularizedWeight(h_sum(i).GetGrad(), h_sum(i).GetHess())); diff --git a/src/tree/fit_stump.cu b/src/tree/fit_stump.cu index 03055e7c9..aea7ba4d7 100644 --- a/src/tree/fit_stump.cu +++ b/src/tree/fit_stump.cu @@ -1,19 +1,18 @@ /** - * Copyright 2022-2023 by XGBoost Contributors + * Copyright 2022-2024, XGBoost Contributors * - * \brief Utilities for estimating initial score. + * @brief Utilities for estimating initial score. 
*/ #if !defined(NOMINMAX) && defined(_WIN32) #define NOMINMAX -#endif // !defined(NOMINMAX) -#include // cuda::par -#include // thrust::make_counting_iterator +#endif // !defined(NOMINMAX) +#include // cuda::par +#include // thrust::make_counting_iterator -#include // std::size_t +#include // std::size_t -#include "../collective/aggregator.cuh" -#include "../collective/communicator-inl.cuh" -#include "../common/device_helpers.cuh" // dh::MakeTransformIterator +#include "../collective/aggregator.cuh" // for GlobalSum +#include "../common/device_helpers.cuh" // dh::MakeTransformIterator #include "fit_stump.h" #include "xgboost/base.h" // GradientPairPrecise, GradientPair, XGBOOST_DEVICE #include "xgboost/context.h" // Context diff --git a/src/tree/fit_stump.hip b/src/tree/fit_stump.hip deleted file mode 100644 index 6b4ddd0af..000000000 --- a/src/tree/fit_stump.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "fit_stump.cu" -#endif diff --git a/src/tree/gpu_hist/evaluate_splits.hip b/src/tree/gpu_hist/evaluate_splits.hip deleted file mode 100644 index 4469d1c1f..000000000 --- a/src/tree/gpu_hist/evaluate_splits.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "evaluate_splits.cu" -#endif diff --git a/src/tree/gpu_hist/evaluator.hip b/src/tree/gpu_hist/evaluator.hip deleted file mode 100644 index b29dd089a..000000000 --- a/src/tree/gpu_hist/evaluator.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "evaluator.cu" -#endif diff --git a/src/tree/gpu_hist/feature_groups.hip b/src/tree/gpu_hist/feature_groups.hip deleted file mode 100644 index ebc9aa533..000000000 --- a/src/tree/gpu_hist/feature_groups.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "feature_groups.cu" -#endif diff --git a/src/tree/gpu_hist/gradient_based_sampler.cu b/src/tree/gpu_hist/gradient_based_sampler.cu index 58add0a93..f9a3819ad 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cu +++ b/src/tree/gpu_hist/gradient_based_sampler.cu @@ -1,13 +1,13 @@ /** - * Copyright 2019-2023 by XGBoost Contributors + * Copyright 2019-2024, XGBoost Contributors */ #include #include +#include // for sort #include #include #include -#include #include // for size_t #include #include @@ -277,7 +277,7 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c common::Span gpair, DMatrix* dmat) { auto cuctx = ctx->CUDACtx(); - bst_row_t n_rows = dmat->Info().num_row_; + bst_idx_t n_rows = dmat->Info().num_row_; size_t threshold_index = GradientBasedSampler::CalculateThresholdIndex( gpair, dh::ToSpan(threshold_), dh::ToSpan(grad_sum_), n_rows * subsample_); diff --git a/src/tree/gpu_hist/gradient_based_sampler.hip b/src/tree/gpu_hist/gradient_based_sampler.hip deleted file mode 100644 index e7094cd3e..000000000 --- a/src/tree/gpu_hist/gradient_based_sampler.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "gradient_based_sampler.cu" -#endif diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 62e40f4d4..7f5a20dbf 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -1,5 +1,5 @@ /** - * Copyright 2020-2023 by XGBoost Contributors + * Copyright 2020-2024, XGBoost Contributors */ #include #include @@ -52,7 +52,7 @@ struct Clip : public thrust::unary_function { * * to avoid outliers, as the full reduction is reproducible on GPU with reduction tree. 
*/ -GradientQuantiser::GradientQuantiser(Context const*, common::Span gpair, +GradientQuantiser::GradientQuantiser(Context const* ctx, common::Span gpair, MetaInfo const& info) { using GradientSumT = GradientPairPrecise; using T = typename GradientSumT::ValueT; @@ -66,11 +66,14 @@ GradientQuantiser::GradientQuantiser(Context const*, common::Span(&p), 4); + auto rc = collective::GlobalSum(ctx, info, linalg::MakeVec(reinterpret_cast(&p), 4)); + collective::SafeColl(rc); + GradientPair positive_sum{p.first}, negative_sum{p.second}; std::size_t total_rows = gpair.size(); - collective::GlobalSum(info, &total_rows, 1); + rc = collective::GlobalSum(ctx, info, linalg::MakeVec(&total_rows, 1)); + collective::SafeColl(rc); auto histogram_rounding = GradientSumT{common::CreateRoundingFactor( diff --git a/src/tree/gpu_hist/histogram.hip b/src/tree/gpu_hist/histogram.hip deleted file mode 100644 index d505b3fd3..000000000 --- a/src/tree/gpu_hist/histogram.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "histogram.cu" -#endif diff --git a/src/tree/gpu_hist/row_partitioner.hip b/src/tree/gpu_hist/row_partitioner.hip deleted file mode 100644 index ac03ac0d7..000000000 --- a/src/tree/gpu_hist/row_partitioner.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "row_partitioner.cu" -#endif diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index bc534d351..d25a41cb0 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -1,5 +1,5 @@ /** - * Copyright 2021-2023 by XGBoost Contributors + * Copyright 2021-2024, XGBoost Contributors */ #ifndef XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_ #define XGBOOST_TREE_HIST_EVALUATE_SPLITS_H_ @@ -26,6 +26,47 @@ #include "xgboost/linalg.h" // for Constants, Vector namespace xgboost::tree { +/** + * @brief Gather the expand entries from all the workers. + * @param entries Local expand entries on this worker. + * @return Global expand entries gathered from all workers. + */ +template +std::enable_if_t || + std::is_same_v, + std::vector> +AllgatherColumnSplit(std::vector const &entries) { + auto const n_entries = entries.size(); + + // First, gather all the primitive fields. + std::vector local_entries(n_entries); + + // Collect and serialize all entries + std::vector> serialized_entries; + for (std::size_t i = 0; i < n_entries; ++i) { + Json jentry{Object{}}; + entries[i].Save(&jentry); + + std::vector out; + Json::Dump(jentry, &out, std::ios::binary); + + serialized_entries.emplace_back(std::move(out)); + } + auto all_serialized = collective::VectorAllgatherV(serialized_entries); + CHECK_GE(all_serialized.size(), local_entries.size()); + + std::vector all_entries(all_serialized.size()); + std::transform(all_serialized.cbegin(), all_serialized.cend(), all_entries.begin(), + [](std::vector const &e) { + ExpandEntry entry; + auto je = Json::Load(StringView{e.data(), e.size()}, std::ios::binary); + entry.Load(je); + return entry; + }); + + return all_entries; +} + class HistEvaluator { private: struct NodeEntry { @@ -36,8 +77,8 @@ class HistEvaluator { }; private: - Context const* ctx_; - TrainParam const* param_; + Context const *ctx_; + TrainParam const *param_; std::shared_ptr column_sampler_; TreeEvaluator tree_evaluator_; bool is_col_split_{false}; @@ -202,7 +243,7 @@ class HistEvaluator { common::CatBitField cat_bits{best.cat_bits}; bst_bin_t partition = d_step == 1 ? 
(best_thresh - it_begin + 1) : (best_thresh - f_begin); CHECK_GT(partition, 0); - std::for_each(sorted_idx.begin(), sorted_idx.begin() + partition, [&](size_t c) { + std::for_each(sorted_idx.begin(), sorted_idx.begin() + partition, [&](std::size_t c) { auto cat = cut_val[c + f_begin]; cat_bits.Set(cat); }); @@ -285,57 +326,23 @@ class HistEvaluator { return left_sum; } - /** - * @brief Gather the expand entries from all the workers. - * @param entries Local expand entries on this worker. - * @return Global expand entries gathered from all workers. - */ - std::vector Allgather(std::vector const &entries) { - auto const world = collective::GetWorldSize(); - auto const num_entries = entries.size(); - - // First, gather all the primitive fields. - std::vector local_entries(num_entries); - std::vector cat_bits; - std::vector cat_bits_sizes; - for (std::size_t i = 0; i < num_entries; i++) { - local_entries[i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes); - } - auto all_entries = collective::Allgather(local_entries); - - // Gather all the cat_bits. - auto gathered = collective::SpecialAllgatherV(cat_bits, cat_bits_sizes); - - common::ParallelFor(num_entries * world, ctx_->Threads(), [&] (auto i) { - // Copy the cat_bits back into all expand entries. - all_entries[i].split.cat_bits.resize(gathered.sizes[i]); - std::copy_n(gathered.result.cbegin() + gathered.offsets[i], gathered.sizes[i], - all_entries[i].split.cat_bits.begin()); - }); - - return all_entries; - } - public: void EvaluateSplits(const BoundedHistCollection &hist, common::HistogramCuts const &cut, common::Span feature_types, const RegTree &tree, std::vector *p_entries) { auto n_threads = ctx_->Threads(); - auto& entries = *p_entries; + auto &entries = *p_entries; // All nodes are on the same level, so we can store the shared ptr. 
- std::vector>> features( - entries.size()); + std::vector>> features(entries.size()); for (size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) { auto nidx = entries[nidx_in_set].nid; - features[nidx_in_set] = - column_sampler_->GetFeatureSet(tree.GetDepth(nidx)); + features[nidx_in_set] = column_sampler_->GetFeatureSet(tree.GetDepth(nidx)); } CHECK(!features.empty()); - const size_t grain_size = - std::max(1, features.front()->Size() / n_threads); - common::BlockedSpace2d space(entries.size(), [&](size_t nidx_in_set) { - return features[nidx_in_set]->Size(); - }, grain_size); + const size_t grain_size = std::max(1, features.front()->Size() / n_threads); + common::BlockedSpace2d space( + entries.size(), [&](size_t nidx_in_set) { return features[nidx_in_set]->Size(); }, + grain_size); std::vector tloc_candidates(n_threads * entries.size()); for (size_t i = 0; i < entries.size(); ++i) { @@ -344,7 +351,7 @@ class HistEvaluator { } } auto evaluator = tree_evaluator_.GetEvaluator(); - auto const& cut_ptrs = cut.Ptrs(); + auto const &cut_ptrs = cut.Ptrs(); common::ParallelFor2d(space, n_threads, [&](size_t nidx_in_set, common::Range1d r) { auto tidx = omp_get_thread_num(); @@ -385,18 +392,16 @@ class HistEvaluator { } }); - for (unsigned nidx_in_set = 0; nidx_in_set < entries.size(); - ++nidx_in_set) { + for (unsigned nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) { for (auto tidx = 0; tidx < n_threads; ++tidx) { - entries[nidx_in_set].split.Update( - tloc_candidates[n_threads * nidx_in_set + tidx].split); + entries[nidx_in_set].split.Update(tloc_candidates[n_threads * nidx_in_set + tidx].split); } } if (is_col_split_) { // With column-wise data split, we gather the best splits from all the workers and update the // expand entries accordingly. - auto all_entries = Allgather(entries); + auto all_entries = AllgatherColumnSplit(entries); for (auto worker = 0; worker < collective::GetWorldSize(); ++worker) { for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) { entries[nidx_in_set].split.Update( @@ -407,7 +412,7 @@ class HistEvaluator { } // Add splits to tree, handles all statistic - void ApplyTreeSplit(CPUExpandEntry const& candidate, RegTree *p_tree) { + void ApplyTreeSplit(CPUExpandEntry const &candidate, RegTree *p_tree) { auto evaluator = tree_evaluator_.GetEvaluator(); RegTree &tree = *p_tree; @@ -437,8 +442,7 @@ class HistEvaluator { auto left_child = tree[candidate.nid].LeftChild(); auto right_child = tree[candidate.nid].RightChild(); tree_evaluator_.AddSplit(candidate.nid, left_child, right_child, - tree[candidate.nid].SplitIndex(), left_weight, - right_weight); + tree[candidate.nid].SplitIndex(), left_weight, right_weight); evaluator = tree_evaluator_.GetEvaluator(); snode_.resize(tree.GetNodes().size()); @@ -449,8 +453,7 @@ class HistEvaluator { snode_.at(right_child).root_gain = evaluator.CalcGain(candidate.nid, *param_, GradStats{candidate.split.right_sum}); - interaction_constraints_.Split(candidate.nid, - tree[candidate.nid].SplitIndex(), left_child, + interaction_constraints_.Split(candidate.nid, tree[candidate.nid].SplitIndex(), left_child, right_child); } @@ -571,53 +574,6 @@ class HistMultiEvaluator { return false; } - /** - * @brief Gather the expand entries from all the workers. - * @param entries Local expand entries on this worker. - * @return Global expand entries gathered from all workers. 
- */ - std::vector Allgather(std::vector const &entries) { - auto const world = collective::GetWorldSize(); - auto const num_entries = entries.size(); - - // First, gather all the primitive fields. - std::vector local_entries(num_entries); - std::vector cat_bits; - std::vector cat_bits_sizes; - std::vector gradients; - for (std::size_t i = 0; i < num_entries; i++) { - local_entries[i].CopyAndCollect(entries[i], &cat_bits, &cat_bits_sizes, &gradients); - } - auto all_entries = collective::Allgather(local_entries); - - // Gather all the cat_bits. - auto gathered_cat_bits = collective::SpecialAllgatherV(cat_bits, cat_bits_sizes); - - // Gather all the gradients. - auto const num_gradients = gradients.size(); - auto const all_gradients = collective::Allgather(gradients); - - auto const total_entries = num_entries * world; - auto const gradients_per_entry = num_gradients / num_entries; - auto const gradients_per_side = gradients_per_entry / 2; - common::ParallelFor(total_entries, ctx_->Threads(), [&] (auto i) { - // Copy the cat_bits back into all expand entries. - all_entries[i].split.cat_bits.resize(gathered_cat_bits.sizes[i]); - std::copy_n(gathered_cat_bits.result.cbegin() + gathered_cat_bits.offsets[i], - gathered_cat_bits.sizes[i], all_entries[i].split.cat_bits.begin()); - - // Copy the gradients back into all expand entries. - all_entries[i].split.left_sum.resize(gradients_per_side); - std::copy_n(all_gradients.cbegin() + i * gradients_per_entry, gradients_per_side, - all_entries[i].split.left_sum.begin()); - all_entries[i].split.right_sum.resize(gradients_per_side); - std::copy_n(all_gradients.cbegin() + i * gradients_per_entry + gradients_per_side, - gradients_per_side, all_entries[i].split.right_sum.begin()); - }); - - return all_entries; - } - public: void EvaluateSplits(RegTree const &tree, common::Span hist, common::HistogramCuts const &cut, std::vector *p_entries) { @@ -676,7 +632,7 @@ class HistMultiEvaluator { if (is_col_split_) { // With column-wise data split, we gather the best splits from all the workers and update the // expand entries accordingly. 
- auto all_entries = Allgather(entries); + auto all_entries = AllgatherColumnSplit(entries); for (auto worker = 0; worker < collective::GetWorldSize(); ++worker) { for (std::size_t nidx_in_set = 0; nidx_in_set < entries.size(); ++nidx_in_set) { entries[nidx_in_set].split.Update( diff --git a/src/tree/hist/expand_entry.h b/src/tree/hist/expand_entry.h index d6315877d..fd16397e1 100644 --- a/src/tree/hist/expand_entry.h +++ b/src/tree/hist/expand_entry.h @@ -90,7 +90,6 @@ struct ExpandEntryImpl { } self->split.is_cat = get(split["is_cat"]); - self->LoadGrad(split); } }; @@ -106,8 +105,8 @@ struct CPUExpandEntry : public ExpandEntryImpl { void SaveGrad(Json* p_out) const { auto& out = *p_out; auto save = [&](std::string const& name, GradStats const& sum) { - out[name] = F32Array{2}; - auto& array = get(out[name]); + out[name] = F64Array{2}; + auto& array = get(out[name]); array[0] = sum.GetGrad(); array[1] = sum.GetHess(); }; @@ -115,9 +114,9 @@ struct CPUExpandEntry : public ExpandEntryImpl { save("right_sum", this->split.right_sum); } void LoadGrad(Json const& in) { - auto const& left_sum = get(in["left_sum"]); + auto const& left_sum = get(in["left_sum"]); this->split.left_sum = GradStats{left_sum[0], left_sum[1]}; - auto const& right_sum = get(in["right_sum"]); + auto const& right_sum = get(in["right_sum"]); this->split.right_sum = GradStats{right_sum[0], right_sum[1]}; } @@ -173,8 +172,8 @@ struct MultiExpandEntry : public ExpandEntryImpl { void SaveGrad(Json* p_out) const { auto& out = *p_out; auto save = [&](std::string const& name, std::vector const& sum) { - out[name] = F32Array{sum.size() * 2}; - auto& array = get(out[name]); + out[name] = F64Array{sum.size() * 2}; + auto& array = get(out[name]); for (std::size_t i = 0, j = 0; i < sum.size(); i++, j += 2) { array[j] = sum[i].GetGrad(); array[j + 1] = sum[i].GetHess(); @@ -185,7 +184,7 @@ struct MultiExpandEntry : public ExpandEntryImpl { } void LoadGrad(Json const& in) { auto load = [&](std::string const& name, std::vector* p_sum) { - auto const& array = get(in[name]); + auto const& array = get(in[name]); auto& sum = *p_sum; sum.resize(array.size() / 2); for (std::size_t i = 0, j = 0; i < sum.size(); ++i, j += 2) { diff --git a/src/tree/hist/sampler.h b/src/tree/hist/sampler.h index 803e40d54..11b4ac1c6 100644 --- a/src/tree/hist/sampler.h +++ b/src/tree/hist/sampler.h @@ -54,7 +54,7 @@ inline void SampleGradient(Context const* ctx, TrainParam param, if (param.subsample >= 1.0) { return; } - bst_row_t n_samples = out.Shape(0); + bst_idx_t n_samples = out.Shape(0); auto& rnd = common::GlobalRandom(); #if XGBOOST_CUSTOMIZE_GLOBAL_PRNG diff --git a/src/tree/tree_model.cc b/src/tree/tree_model.cc index f18b51926..45834cc77 100644 --- a/src/tree/tree_model.cc +++ b/src/tree/tree_model.cc @@ -1,5 +1,5 @@ /** - * Copyright 2015-2023, XGBoost Contributors + * Copyright 2015-2024, XGBoost Contributors * \file tree_model.cc * \brief model structure for tree */ @@ -8,6 +8,7 @@ #include #include +#include // for array #include #include #include @@ -15,7 +16,7 @@ #include #include "../common/categorical.h" -#include "../common/common.h" // for EscapeU8 +#include "../common/common.h" // for EscapeU8 #include "../predictor/predict_fn.h" #include "io_utils.h" // for GetElem #include "param.h" @@ -31,26 +32,50 @@ namespace tree { DMLC_REGISTER_PARAMETER(TrainParam); } +namespace { +template +std::enable_if_t, std::string> ToStr(Float value) { + int32_t constexpr kFloatMaxPrecision = std::numeric_limits::max_digits10; + 
static_assert(std::is_floating_point::value, + "Use std::to_string instead for non-floating point values."); + std::stringstream ss; + ss << std::setprecision(kFloatMaxPrecision) << value; + return ss.str(); +} + +template +std::string ToStr(linalg::VectorView value, bst_target_t limit) { + int32_t constexpr kFloatMaxPrecision = std::numeric_limits::max_digits10; + static_assert(std::is_floating_point::value, + "Use std::to_string instead for non-floating point values."); + std::stringstream ss; + ss << std::setprecision(kFloatMaxPrecision); + if (value.Size() == 1) { + ss << value(0); + return ss.str(); + } + CHECK_GE(limit, 2); + auto n = std::min(static_cast(value.Size() - 1), limit - 1); + ss << "["; + for (std::size_t i = 0; i < n; ++i) { + ss << value(i) << ", "; + } + if (value.Size() > limit) { + ss << "..., "; + } + ss << value(value.Size() - 1) << "]"; + return ss.str(); +} +} // namespace /*! * \brief Base class for dump model implementation, modeling closely after code generator. */ class TreeGenerator { protected: - static int32_t constexpr kFloatMaxPrecision = - std::numeric_limits::max_digits10; FeatureMap const& fmap_; std::stringstream ss_; bool const with_stats_; - template - static std::string ToStr(Float value) { - static_assert(std::is_floating_point::value, - "Use std::to_string instead for non-floating point values."); - std::stringstream ss; - ss << std::setprecision(kFloatMaxPrecision) << value; - return ss.str(); - } - static std::string Tabs(uint32_t n) { std::string res; for (uint32_t i = 0; i < n; ++i) { @@ -258,10 +283,10 @@ class TextGenerator : public TreeGenerator { kLeafTemplate, {{"{tabs}", SuperT::Tabs(depth)}, {"{nid}", std::to_string(nid)}, - {"{leaf}", SuperT::ToStr(tree[nid].LeafValue())}, + {"{leaf}", ToStr(tree[nid].LeafValue())}, {"{stats}", with_stats_ ? SuperT::Match(kStatTemplate, - {{"{cover}", SuperT::ToStr(tree.Stat(nid).sum_hess)}}) : ""}}); + {{"{cover}", ToStr(tree.Stat(nid).sum_hess)}}) : ""}}); return result; } @@ -311,14 +336,14 @@ class TextGenerator : public TreeGenerator { static std::string const kQuantitiveTemplate = "{tabs}{nid}:[{fname}<{cond}] yes={left},no={right},missing={missing}"; auto cond = tree[nid].SplitCond(); - return SplitNodeImpl(tree, nid, kQuantitiveTemplate, SuperT::ToStr(cond), depth); + return SplitNodeImpl(tree, nid, kQuantitiveTemplate, ToStr(cond), depth); } std::string PlainNode(RegTree const& tree, int32_t nid, uint32_t depth) const override { auto cond = tree[nid].SplitCond(); static std::string const kNodeTemplate = "{tabs}{nid}:[{fname}<{cond}] yes={left},no={right},missing={missing}"; - return SplitNodeImpl(tree, nid, kNodeTemplate, SuperT::ToStr(cond), depth); + return SplitNodeImpl(tree, nid, kNodeTemplate, ToStr(cond), depth); } std::string Categorical(RegTree const &tree, int32_t nid, @@ -336,8 +361,8 @@ class TextGenerator : public TreeGenerator { static std::string const kStatTemplate = ",gain={loss_chg},cover={sum_hess}"; std::string const result = SuperT::Match( kStatTemplate, - {{"{loss_chg}", SuperT::ToStr(tree.Stat(nid).loss_chg)}, - {"{sum_hess}", SuperT::ToStr(tree.Stat(nid).sum_hess)}}); + {{"{loss_chg}", ToStr(tree.Stat(nid).loss_chg)}, + {"{sum_hess}", ToStr(tree.Stat(nid).sum_hess)}}); return result; } @@ -393,11 +418,11 @@ class JsonGenerator : public TreeGenerator { std::string result = SuperT::Match( kLeafTemplate, {{"{nid}", std::to_string(nid)}, - {"{leaf}", SuperT::ToStr(tree[nid].LeafValue())}, + {"{leaf}", ToStr(tree[nid].LeafValue())}, {"{stat}", with_stats_ ? 
SuperT::Match( kStatTemplate, {{"{sum_hess}", - SuperT::ToStr(tree.Stat(nid).sum_hess)}}) : ""}}); + ToStr(tree.Stat(nid).sum_hess)}}) : ""}}); return result; } @@ -468,7 +493,7 @@ class JsonGenerator : public TreeGenerator { R"I("split_condition": {cond}, "yes": {left}, "no": {right}, )I" R"I("missing": {missing})I"; bst_float cond = tree[nid].SplitCond(); - return SplitNodeImpl(tree, nid, kQuantitiveTemplate, SuperT::ToStr(cond), depth); + return SplitNodeImpl(tree, nid, kQuantitiveTemplate, ToStr(cond), depth); } std::string PlainNode(RegTree const& tree, int32_t nid, uint32_t depth) const override { @@ -477,7 +502,7 @@ class JsonGenerator : public TreeGenerator { R"I( "nodeid": {nid}, "depth": {depth}, "split": "{fname}", )I" R"I("split_condition": {cond}, "yes": {left}, "no": {right}, )I" R"I("missing": {missing})I"; - return SplitNodeImpl(tree, nid, kNodeTemplate, SuperT::ToStr(cond), depth); + return SplitNodeImpl(tree, nid, kNodeTemplate, ToStr(cond), depth); } std::string NodeStat(RegTree const& tree, int32_t nid) const override { @@ -485,8 +510,8 @@ class JsonGenerator : public TreeGenerator { R"S(, "gain": {loss_chg}, "cover": {sum_hess})S"; auto result = SuperT::Match( kStatTemplate, - {{"{loss_chg}", SuperT::ToStr(tree.Stat(nid).loss_chg)}, - {"{sum_hess}", SuperT::ToStr(tree.Stat(nid).sum_hess)}}); + {{"{loss_chg}", ToStr(tree.Stat(nid).loss_chg)}, + {"{sum_hess}", ToStr(tree.Stat(nid).sum_hess)}}); return result; } @@ -622,11 +647,11 @@ class GraphvizGenerator : public TreeGenerator { protected: template - std::string BuildEdge(RegTree const &tree, bst_node_t nid, int32_t child, bool left) const { + std::string BuildEdge(RegTree const &tree, bst_node_t nidx, int32_t child, bool left) const { static std::string const kEdgeTemplate = " {nid} -> {child} [label=\"{branch}\" color=\"{color}\"]\n"; // Is this the default child for missing value? - bool is_missing = tree[nid].DefaultChild() == child; + bool is_missing = tree.DefaultChild(nidx) == child; std::string branch; if (is_categorical) { branch = std::string{left ? "no" : "yes"} + std::string{is_missing ? ", missing" : ""}; @@ -635,7 +660,7 @@ class GraphvizGenerator : public TreeGenerator { } std::string buffer = SuperT::Match(kEdgeTemplate, - {{"{nid}", std::to_string(nid)}, + {{"{nid}", std::to_string(nidx)}, {"{child}", std::to_string(child)}, {"{color}", is_missing ? param_.yes_color : param_.no_color}, {"{branch}", branch}}); @@ -644,68 +669,77 @@ class GraphvizGenerator : public TreeGenerator { // Only indicator is different, so we combine all different node types into this // function. - std::string PlainNode(RegTree const& tree, int32_t nid, uint32_t) const override { - auto split_index = tree[nid].SplitIndex(); - auto cond = tree[nid].SplitCond(); + std::string PlainNode(RegTree const& tree, bst_node_t nidx, uint32_t) const override { + auto split_index = tree.SplitIndex(nidx); + auto cond = tree.SplitCond(nidx); static std::string const kNodeTemplate = " {nid} [ label=\"{fname}{<}{cond}\" {params}]\n"; bool has_less = (split_index >= fmap_.Size()) || fmap_.TypeOf(split_index) != FeatureMap::kIndicator; std::string result = - SuperT::Match(kNodeTemplate, {{"{nid}", std::to_string(nid)}, + SuperT::Match(kNodeTemplate, {{"{nid}", std::to_string(nidx)}, {"{fname}", GetFeatureName(fmap_, split_index)}, {"{<}", has_less ? "<" : ""}, - {"{cond}", has_less ? SuperT::ToStr(cond) : ""}, + {"{cond}", has_less ? 
ToStr(cond) : ""}, {"{params}", param_.condition_node_params}}); - result += BuildEdge(tree, nid, tree[nid].LeftChild(), true); - result += BuildEdge(tree, nid, tree[nid].RightChild(), false); + result += BuildEdge(tree, nidx, tree.LeftChild(nidx), true); + result += BuildEdge(tree, nidx, tree.RightChild(nidx), false); return result; }; - std::string Categorical(RegTree const& tree, int32_t nid, uint32_t) const override { + std::string Categorical(RegTree const& tree, bst_node_t nidx, uint32_t) const override { static std::string const kLabelTemplate = " {nid} [ label=\"{fname}:{cond}\" {params}]\n"; - auto cats = GetSplitCategories(tree, nid); + auto cats = GetSplitCategories(tree, nidx); auto cats_str = PrintCatsAsSet(cats); - auto split_index = tree[nid].SplitIndex(); + auto split_index = tree.SplitIndex(nidx); std::string result = - SuperT::Match(kLabelTemplate, {{"{nid}", std::to_string(nid)}, + SuperT::Match(kLabelTemplate, {{"{nid}", std::to_string(nidx)}, {"{fname}", GetFeatureName(fmap_, split_index)}, {"{cond}", cats_str}, {"{params}", param_.condition_node_params}}); - result += BuildEdge(tree, nid, tree[nid].LeftChild(), true); - result += BuildEdge(tree, nid, tree[nid].RightChild(), false); + result += BuildEdge(tree, nidx, tree.LeftChild(nidx), true); + result += BuildEdge(tree, nidx, tree.RightChild(nidx), false); return result; } - std::string LeafNode(RegTree const& tree, int32_t nid, uint32_t) const override { - static std::string const kLeafTemplate = - " {nid} [ label=\"leaf={leaf-value}\" {params}]\n"; - auto result = SuperT::Match(kLeafTemplate, { - {"{nid}", std::to_string(nid)}, - {"{leaf-value}", ToStr(tree[nid].LeafValue())}, - {"{params}", param_.leaf_node_params}}); - return result; - }; + std::string LeafNode(RegTree const& tree, bst_node_t nidx, uint32_t) const override { + static std::string const kLeafTemplate = " {nid} [ label=\"leaf={leaf-value}\" {params}]\n"; + // hardcoded limit to avoid dumping long arrays into dot graph. + bst_target_t constexpr kLimit{3}; + if (tree.IsMultiTarget()) { + auto value = tree.GetMultiTargetTree()->LeafValue(nidx); + auto result = SuperT::Match(kLeafTemplate, {{"{nid}", std::to_string(nidx)}, + {"{leaf-value}", ToStr(value, kLimit)}, + {"{params}", param_.leaf_node_params}}); + return result; + } else { + auto value = tree[nidx].LeafValue(); + auto result = SuperT::Match(kLeafTemplate, {{"{nid}", std::to_string(nidx)}, + {"{leaf-value}", ToStr(value)}, + {"{params}", param_.leaf_node_params}}); + return result; + } + } - std::string BuildTree(RegTree const& tree, int32_t nid, uint32_t depth) override { - if (tree[nid].IsLeaf()) { - return this->LeafNode(tree, nid, depth); + std::string BuildTree(RegTree const& tree, bst_node_t nidx, uint32_t depth) override { + if (tree.IsLeaf(nidx)) { + return this->LeafNode(tree, nidx, depth); } static std::string const kNodeTemplate = "{parent}\n{left}\n{right}"; - auto node = tree.GetSplitTypes()[nid] == FeatureType::kCategorical - ? this->Categorical(tree, nid, depth) - : this->PlainNode(tree, nid, depth); + auto node = tree.GetSplitTypes()[nidx] == FeatureType::kCategorical + ? 
this->Categorical(tree, nidx, depth) + : this->PlainNode(tree, nidx, depth); auto result = SuperT::Match( kNodeTemplate, {{"{parent}", node}, - {"{left}", this->BuildTree(tree, tree[nid].LeftChild(), depth+1)}, - {"{right}", this->BuildTree(tree, tree[nid].RightChild(), depth+1)}}); + {"{left}", this->BuildTree(tree, tree.LeftChild(nidx), depth+1)}, + {"{right}", this->BuildTree(tree, tree.RightChild(nidx), depth+1)}}); return result; } @@ -733,7 +767,9 @@ XGBOOST_REGISTER_TREE_IO(GraphvizGenerator, "dot") constexpr bst_node_t RegTree::kRoot; std::string RegTree::DumpModel(const FeatureMap& fmap, bool with_stats, std::string format) const { - CHECK(!IsMultiTarget()); + if (this->IsMultiTarget() && format != "dot") { + LOG(FATAL) << format << " tree dump " << MTNotImplemented(); + } std::unique_ptr builder{TreeGenerator::Create(format, fmap, with_stats)}; builder->BuildTree(*this); diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc index 94e7547ee..68317fc41 100644 --- a/src/tree/updater_approx.cc +++ b/src/tree/updater_approx.cc @@ -1,5 +1,5 @@ /** - * Copyright 2021-2023 by XGBoost contributors + * Copyright 2021-2024, XGBoost contributors * * \brief Implementation for the approx tree method. */ @@ -107,7 +107,10 @@ class GloablApproxBuilder { for (auto const &g : gpair) { root_sum.Add(g); } - collective::GlobalSum(p_fmat->Info(), reinterpret_cast(&root_sum), 2); + auto rc = collective::GlobalSum(ctx_, p_fmat->Info(), + linalg::MakeVec(reinterpret_cast(&root_sum), 2)); + collective::SafeColl(rc); + std::vector nodes{best}; this->histogram_builder_.BuildRootHist(p_fmat, p_tree, partitioner_, linalg::MakeTensorView(ctx_, gpair, gpair.size(), 1), diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc index ef166fae5..45018da17 100644 --- a/src/tree/updater_colmaker.cc +++ b/src/tree/updater_colmaker.cc @@ -106,6 +106,9 @@ class ColMaker: public TreeUpdater { if (dmat->Info().HasCategorical()) { LOG(FATAL) << error::NoCategorical("Updater `grow_colmaker` or `exact` tree method"); } + if (param->colsample_bynode - 1.0 != 0.0) { + LOG(FATAL) << "column sample by node is not yet supported by the exact tree method"; + } this->LazyGetColumnDensity(dmat); // rescale learning rate according to size of trees interaction_constraints_.Configure(*param, dmat->Info().num_row_); @@ -440,9 +443,8 @@ class ColMaker: public TreeUpdater { } // update the solution candidate - virtual void UpdateSolution(const SortedCSCPage &batch, - const std::vector &feat_set, - const std::vector &gpair, DMatrix *) { + void UpdateSolution(SortedCSCPage const &batch, const std::vector &feat_set, + const std::vector &gpair) { // start enumeration const auto num_features = feat_set.size(); CHECK(this->ctx_); @@ -466,17 +468,15 @@ class ColMaker: public TreeUpdater { } }); } + // find splits at current level, do split per level - inline void FindSplit(int depth, - const std::vector &qexpand, - const std::vector &gpair, - DMatrix *p_fmat, - RegTree *p_tree) { + void FindSplit(bst_node_t depth, const std::vector &qexpand, + std::vector const &gpair, DMatrix *p_fmat, RegTree *p_tree) { auto evaluator = tree_evaluator_.GetEvaluator(); auto feat_set = column_sampler_->GetFeatureSet(depth); for (const auto &batch : p_fmat->GetBatches(ctx_)) { - this->UpdateSolution(batch, feat_set->HostVector(), gpair, p_fmat); + this->UpdateSolution(batch, feat_set->HostVector(), gpair); } // after this each thread's stemp will get the best candidates, aggregate results this->SyncBestSolution(qexpand); diff --git 
a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 4c535e7eb..83bcf7a7c 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -1,5 +1,5 @@ /** - * Copyright 2017-2023 by XGBoost contributors + * Copyright 2017-2024, XGBoost contributors */ #include #include @@ -192,7 +192,7 @@ struct GPUHistMakerDevice { std::unique_ptr feature_groups; GPUHistMakerDevice(Context const* ctx, bool is_external_memory, - common::Span _feature_types, bst_row_t _n_rows, + common::Span _feature_types, bst_idx_t _n_rows, TrainParam _param, std::shared_ptr column_sampler, uint32_t n_features, BatchParam batch_param, MetaInfo const& info) : evaluator_{_param, n_features, ctx->Device()}, @@ -735,7 +735,9 @@ struct GPUHistMakerDevice { dh::Reduce(ctx_->CUDACtx()->CTP(), gpair_it, gpair_it + gpair.size(), GradientPairInt64{}, thrust::plus{}); using ReduceT = typename decltype(root_sum_quantised)::ValueT; - collective::GlobalSum(info_, reinterpret_cast(&root_sum_quantised), 2); + auto rc = collective::GlobalSum( + ctx_, info_, linalg::MakeVec(reinterpret_cast(&root_sum_quantised), 2)); + collective::SafeColl(rc); hist.AllocateHistograms({kRootNIdx}); this->BuildHist(kRootNIdx); diff --git a/src/tree/updater_gpu_hist.hip b/src/tree/updater_gpu_hist.hip deleted file mode 100644 index e0f3be6a3..000000000 --- a/src/tree/updater_gpu_hist.hip +++ /dev/null @@ -1,4 +0,0 @@ - -#if defined(XGBOOST_USE_HIP) -#include "updater_gpu_hist.cu" -#endif diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index c2aaedafa..ced277773 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -1,5 +1,5 @@ /** - * Copyright 2017-2023, XGBoost Contributors + * Copyright 2017-2024, XGBoost Contributors * \file updater_quantile_hist.cc * \brief use quantized feature values to construct a tree * \author Philip Cho, Tianqi Checn, Egor Smirnov @@ -149,9 +149,6 @@ class MultiTargetHistBuilder { } void InitData(DMatrix *p_fmat, RegTree const *p_tree) { - if (collective::IsDistributed()) { - LOG(FATAL) << "Distributed training for vector-leaf is not yet supported."; - } monitor_->Start(__func__); p_last_fmat_ = p_fmat; @@ -202,8 +199,10 @@ class MultiTargetHistBuilder { } } CHECK(root_sum.CContiguous()); - collective::GlobalSum(p_fmat->Info(), reinterpret_cast(root_sum.Values().data()), - root_sum.Size() * 2); + auto rc = collective::GlobalSum( + ctx_, p_fmat->Info(), + linalg::MakeVec(reinterpret_cast(root_sum.Values().data()), root_sum.Size() * 2)); + collective::SafeColl(rc); histogram_builder_->BuildRootHist(p_fmat, p_tree, partitioner_, gpair, best, HistBatch(param_)); @@ -411,7 +410,9 @@ class HistUpdater { for (auto const &grad : gpair_h) { grad_stat.Add(grad.GetGrad(), grad.GetHess()); } - collective::GlobalSum(p_fmat->Info(), reinterpret_cast(&grad_stat), 2); + auto rc = collective::GlobalSum(ctx_, p_fmat->Info(), + linalg::MakeVec(reinterpret_cast(&grad_stat), 2)); + collective::SafeColl(rc); } auto weight = evaluator_->InitRoot(GradStats{grad_stat}); @@ -474,6 +475,7 @@ class QuantileHistMaker : public TreeUpdater { std::unique_ptr p_impl_{nullptr}; std::unique_ptr p_mtimpl_{nullptr}; std::shared_ptr column_sampler_; + common::Monitor monitor_; ObjInfo const *task_{nullptr}; HistMakerTrainParam hist_param_; diff --git a/tests/buildkite/conftest.sh b/tests/buildkite/conftest.sh index 3df79b58d..44043910b 100755 --- a/tests/buildkite/conftest.sh +++ b/tests/buildkite/conftest.sh @@ -24,7 +24,7 @@ set -x CUDA_VERSION=11.8.0 
NCCL_VERSION=2.16.5-1 -RAPIDS_VERSION=23.12 +RAPIDS_VERSION=24.04 SPARK_VERSION=3.4.0 JDK_VERSION=8 R_VERSION=4.3.2 @@ -39,13 +39,14 @@ fi if [[ -n $BUILDKITE_PULL_REQUEST && $BUILDKITE_PULL_REQUEST != "false" ]] then is_pull_request=1 - export BRANCH_NAME=PR-$BUILDKITE_PULL_REQUEST + BRANCH_NAME=PR-$BUILDKITE_PULL_REQUEST else is_pull_request=0 - export BRANCH_NAME=$BUILDKITE_BRANCH + BRANCH_NAME=$BUILDKITE_BRANCH fi +export BRANCH_NAME=${BRANCH_NAME//\//-} -if [[ $BUILDKITE_BRANCH == "master" || $BUILDKITE_BRANCH == "release_"* ]] +if [[ $BRANCH_NAME == "master" || $BRANCH_NAME == "release_"* ]] then is_release_branch=1 enforce_daily_budget=0 diff --git a/tests/buildkite/infrastructure/README.md b/tests/buildkite/infrastructure/README.md new file mode 100644 index 000000000..cc3e552e7 --- /dev/null +++ b/tests/buildkite/infrastructure/README.md @@ -0,0 +1,106 @@ +BuildKite CI Infrastructure +=========================== + +# Worker image builder (`worker-image-pipeline/`) + +Use EC2 Image Builder to build machine images in a deterministic fashion. +The machine images are used to initialize workers in the CI/CD pipelines. + +## Editing bootstrap scripts + +Currently, we create two pipelines for machine images: one for Linux workers and another +for Windows workers. +You can edit the bootstrap scripts to change how the worker machines are initialized. + +* `linux-amd64-gpu-bootstrap.yml`: Bootstrap script for Linux worker machines +* `windows-gpu-bootstrap.yml`: Bootstrap script for Windows worker machines + +## Creating and running Image Builder pipelines + +Run the following commands to create and run pipelines in the EC2 Image Builder service: +```bash +python worker-image-pipeline/create_worker_image_pipelines.py --aws-region us-west-2 +python worker-image-pipeline/run_pipelines.py --aws-region us-west-2 +``` +Go to the AWS CloudFormation console and verify the existence of two CloudFormation stacks: +* `buildkite-windows-gpu-worker` +* `buildkite-linux-amd64-gpu-worker` + +Then go to the EC2 Image Builder console to check the status of the image builds. You may +want to inspect the log output should a build fail. +Once the new machine images are done building, see the next section to deploy the new +images to the worker machines. + +# Elastic CI Stack for AWS (`aws-stack-creator/`) + +Use EC2 Autoscaling groups to launch worker machines in EC2. BuildKite periodically sends +messages to the Autoscaling groups to increase or decrease the number of workers according +to the number of outstanding testing jobs. + +## Deploy an updated CI stack with new machine images + +First, edit `aws-stack-creator/metadata.py` to update the `AMI_ID` fields: +```python +AMI_ID = { + # Managed by XGBoost team + "linux-amd64-gpu": { + "us-west-2": "...", + }, + "linux-amd64-mgpu": { + "us-west-2": "...", + }, + "windows-gpu": { + "us-west-2": "...", + }, + "windows-cpu": { + "us-west-2": "...", + }, + # Managed by BuildKite + # from https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml + "linux-amd64-cpu": { + "us-west-2": "...", + }, + "pipeline-loader": { + "us-west-2": "...", + }, + "linux-arm64-cpu": { + "us-west-2": "...", + }, +} +``` +AMI IDs uniquely identify the machine images in the EC2 service.
+Go to the EC2 Image Builder console to find the AMI IDs for the new machine images +(see the previous section), and update the following fields: + +* `AMI_ID["linux-amd64-gpu"]["us-west-2"]`: + Use the latest output from the `buildkite-linux-amd64-gpu-worker` pipeline +* `AMI_ID["linux-amd64-mgpu"]["us-west-2"]`: + Should be identical to `AMI_ID["linux-amd64-gpu"]["us-west-2"]` +* `AMI_ID["windows-gpu"]["us-west-2"]`: + Use the latest output from the `buildkite-windows-gpu-worker` pipeline +* `AMI_ID["windows-cpu"]["us-west-2"]`: + Should be identical to `AMI_ID["windows-gpu"]["us-west-2"]` + +Next, visit https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml +to look up the AMI IDs for the following fields: + +* `AMI_ID["linux-amd64-cpu"]["us-west-2"]`: Copy and paste the AMI ID from the field + `Mappings/AWSRegion2AMI/us-west-2/linuxamd64` +* `AMI_ID["pipeline-loader"]["us-west-2"]`: + Should be identical to `AMI_ID["linux-amd64-cpu"]["us-west-2"]` +* `AMI_ID["linux-arm64-cpu"]["us-west-2"]`: Copy and paste the AMI ID from the field + `Mappings/AWSRegion2AMI/us-west-2/linuxarm64` + +Finally, run the following commands to deploy the new machine images: +``` +python aws-stack-creator/create_stack.py --aws-region us-west-2 --agent-token AGENT_TOKEN +``` +Go to the AWS CloudFormation console and verify the existence of the following +CloudFormation stacks: +* `buildkite-pipeline-loader-autoscaling-group` +* `buildkite-linux-amd64-cpu-autoscaling-group` +* `buildkite-linux-amd64-gpu-autoscaling-group` +* `buildkite-linux-amd64-mgpu-autoscaling-group` +* `buildkite-linux-arm64-cpu-autoscaling-group` +* `buildkite-windows-cpu-autoscaling-group` +* `buildkite-windows-gpu-autoscaling-group` diff --git a/tests/buildkite/infrastructure/aws-stack-creator/metadata.py b/tests/buildkite/infrastructure/aws-stack-creator/metadata.py index 3b56a2d8c..e086021da 100644 --- a/tests/buildkite/infrastructure/aws-stack-creator/metadata.py +++ b/tests/buildkite/infrastructure/aws-stack-creator/metadata.py @@ -1,27 +1,27 @@ AMI_ID = { # Managed by XGBoost team "linux-amd64-gpu": { - "us-west-2": "ami-08c3bc1dd5ec8bc5c", + "us-west-2": "ami-070080d04e81c5e39", }, "linux-amd64-mgpu": { - "us-west-2": "ami-08c3bc1dd5ec8bc5c", + "us-west-2": "ami-070080d04e81c5e39", }, "windows-gpu": { - "us-west-2": "ami-03c7f2156f93b22a7", + "us-west-2": "ami-07c14abcf529d816a", }, "windows-cpu": { - "us-west-2": "ami-03c7f2156f93b22a7", + "us-west-2": "ami-07c14abcf529d816a", }, # Managed by BuildKite # from https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml "linux-amd64-cpu": { - "us-west-2": "ami-015e64acb52b3e595", + "us-west-2": "ami-0180f7fb0f07eb0bc", }, "pipeline-loader": { - "us-west-2": "ami-015e64acb52b3e595", + "us-west-2": "ami-0180f7fb0f07eb0bc", }, "linux-arm64-cpu": { - "us-west-2": "ami-0884e9c23a2fa98d0", + "us-west-2": "ami-00686bdc2043a5505", }, } diff --git a/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml b/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml index e4d212fda..128351e0d 100644 --- a/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml +++ b/tests/buildkite/infrastructure/worker-image-pipeline/windows-gpu-bootstrap.yml @@ -15,9 +15,9 @@ phases: choco --version choco feature enable -n=allowGlobalConfirmation - # CMake 3.27 - Write-Host '>>> Installing CMake 3.27...' 
- choco install cmake --version 3.27.9 --installargs "ADD_CMAKE_TO_PATH=System"
+ # CMake 3.29.2
+ Write-Host '>>> Installing CMake 3.29.2...'
+ choco install cmake --version 3.29.2 --installargs "ADD_CMAKE_TO_PATH=System"
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Notepad++
@@ -53,9 +53,9 @@ phases:
"--wait --passive --norestart --includeOptional"
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
- # Install CUDA 11.8
- Write-Host '>>> Installing CUDA 11.8...'
- choco install cuda --version=11.8.0.52206
+ # Install CUDA 12.4
+ Write-Host '>>> Installing CUDA 12.4...'
+ choco install cuda --version=12.4.1.551
if ($LASTEXITCODE -ne 0) { throw "Last command failed" }
# Install R
diff --git a/tests/ci_build/Dockerfile.gpu b/tests/ci_build/Dockerfile.gpu
index 0a5adb6ea..255dd9d71 100644
--- a/tests/ci_build/Dockerfile.gpu
+++ b/tests/ci_build/Dockerfile.gpu
@@ -21,14 +21,14 @@ ENV PATH=/opt/mambaforge/bin:$PATH
# Create new Conda environment with cuDF, Dask, and cuPy
RUN \
- conda install -c conda-forge mamba && \
- mamba create -n gpu_test -c rapidsai-nightly -c rapidsai -c nvidia -c conda-forge -c defaults \
+ export NCCL_SHORT_VER=$(echo "$NCCL_VERSION_ARG" | cut -d "-" -f 1) && \
+ mamba create -y -n gpu_test -c rapidsai -c nvidia -c conda-forge \
python=3.10 cudf=$RAPIDS_VERSION_ARG* rmm=$RAPIDS_VERSION_ARG* cudatoolkit=$CUDA_VERSION_ARG \
- nccl>=$(cut -d "-" -f 1 << $NCCL_VERSION_ARG) \
- dask \
+ "nccl>=${NCCL_SHORT_VER}" \
+ dask=2024.1.1 \
dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
numpy pytest pytest-timeout scipy scikit-learn pandas matplotlib wheel python-kubernetes urllib3 graphviz hypothesis \
- pyspark>=3.4.0 cloudpickle cuda-python && \
+ "pyspark>=3.4.0" cloudpickle cuda-python && \
mamba clean --all && \
conda run --no-capture-output -n gpu_test pip install buildkite-test-collector
diff --git a/tests/ci_build/Dockerfile.jvm b/tests/ci_build/Dockerfile.jvm
index 43fbd8ff5..a115fd52c 100644
--- a/tests/ci_build/Dockerfile.jvm
+++ b/tests/ci_build/Dockerfile.jvm
@@ -15,9 +15,9 @@ RUN \
wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
# Maven
- wget -nv -nc https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
- tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \
- ln -s /opt/apache-maven-3.6.1/ /opt/maven
+ wget -nv -nc https://archive.apache.org/dist/maven/maven-3/3.6.3/binaries/apache-maven-3.6.3-bin.tar.gz && \
+ tar xvf apache-maven-3.6.3-bin.tar.gz -C /opt && \
+ ln -s /opt/apache-maven-3.6.3/ /opt/maven
ENV PATH=/opt/mambaforge/bin:/opt/maven/bin:$PATH
ENV CC=/opt/rh/devtoolset-9/root/usr/bin/gcc
diff --git a/tests/ci_build/Dockerfile.jvm_cross b/tests/ci_build/Dockerfile.jvm_cross
index fdfae310a..5c4bb569b 100644
--- a/tests/ci_build/Dockerfile.jvm_cross
+++ b/tests/ci_build/Dockerfile.jvm_cross
@@ -17,9 +17,9 @@ RUN \
bash conda.sh -b -p /opt/mambaforge && \
/opt/mambaforge/bin/pip install awscli && \
# Maven
- wget -nv https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
- tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \
- ln -s /opt/apache-maven-3.6.1/ /opt/maven && \
+ wget -nv https://archive.apache.org/dist/maven/maven-3/3.6.3/binaries/apache-maven-3.6.3-bin.tar.gz && \
+ tar xvf apache-maven-3.6.3-bin.tar.gz -C /opt && \
+ ln -s /opt/apache-maven-3.6.3/ /opt/maven && \
# Spark with scala 2.12
mkdir -p /opt/spark-scala-2.12 && \
wget -nv https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop3.tgz && \
diff --git a/tests/ci_build/Dockerfile.jvm_gpu_build b/tests/ci_build/Dockerfile.jvm_gpu_build
index 86ce7e72a..cee418942 100644
--- a/tests/ci_build/Dockerfile.jvm_gpu_build
+++ b/tests/ci_build/Dockerfile.jvm_gpu_build
@@ -18,9 +18,9 @@ RUN \
wget -nv -nc https://cmake.org/files/v3.18/cmake-3.18.0-Linux-x86_64.sh --no-check-certificate && \
bash cmake-3.18.0-Linux-x86_64.sh --skip-license --prefix=/usr && \
# Maven
- wget -nv -nc https://archive.apache.org/dist/maven/maven-3/3.6.1/binaries/apache-maven-3.6.1-bin.tar.gz && \
- tar xvf apache-maven-3.6.1-bin.tar.gz -C /opt && \
- ln -s /opt/apache-maven-3.6.1/ /opt/maven
+ wget -nv -nc https://archive.apache.org/dist/maven/maven-3/3.6.3/binaries/apache-maven-3.6.3-bin.tar.gz && \
+ tar xvf apache-maven-3.6.3-bin.tar.gz -C /opt && \
+ ln -s /opt/apache-maven-3.6.3/ /opt/maven
# NCCL2 (License: https://docs.nvidia.com/deeplearning/sdk/nccl-sla/index.html)
RUN \
diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py
index 91b748b4c..741ef7558 100644
--- a/tests/ci_build/lint_python.py
+++ b/tests/ci_build/lint_python.py
@@ -100,6 +100,7 @@ class LintersPaths:
# demo
"demo/json-model/json_parser.py",
"demo/guide-python/external_memory.py",
+ "demo/guide-python/sklearn_examples.py",
"demo/guide-python/continuation.py",
"demo/guide-python/callbacks.py",
"demo/guide-python/cat_in_the_dat.py",
diff --git a/tests/ci_build/test_r_package.py b/tests/ci_build/test_r_package.py
index dd73f850b..ddcf48674 100644
--- a/tests/ci_build/test_r_package.py
+++ b/tests/ci_build/test_r_package.py
@@ -277,6 +277,19 @@ def test_with_cmake(args: argparse.Namespace) -> None:
"Release",
]
)
+ elif args.compiler == "none":
+ subprocess.check_call(
+ [
+ "cmake",
+ os.path.pardir,
+ "-DUSE_OPENMP=ON",
+ "-DR_LIB=ON",
+ "-DCMAKE_CONFIGURATION_TYPES=Release",
+ "-G",
+ "Unix Makefiles",
+ ]
+ )
+ subprocess.check_call(["make", "-j", "install"])
else:
raise ValueError("Wrong compiler")
with DirectoryExcursion(R_PACKAGE):
@@ -333,9 +346,9 @@ if __name__ == "__main__":
parser.add_argument(
"--compiler",
type=str,
- choices=["mingw", "msvc"],
+ choices=["mingw", "msvc", "none"],
help="Compiler used for compiling CXX code. Only relevant for windows build",
- default="mingw",
+ default="none",
required=False,
)
parser.add_argument(
diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt
index 4d2ed6b18..fbf0bb901 100644
--- a/tests/cpp/CMakeLists.txt
+++ b/tests/cpp/CMakeLists.txt
@@ -34,14 +34,14 @@ if(PLUGIN_SYCL)
${xgboost_SOURCE_DIR}/rabit/include)
target_compile_definitions(plugin_sycl_test PUBLIC -DXGBOOST_USE_SYCL=1)
- target_link_libraries(plugin_sycl_test PUBLIC -fsycl)
+ target_link_libraries(plugin_sycl_test PRIVATE ${GTEST_LIBRARIES})
set_target_properties(plugin_sycl_test PROPERTIES
- COMPILE_FLAGS -fsycl
- CXX_STANDARD 17
- CXX_STANDARD_REQUIRED ON
- POSITION_INDEPENDENT_CODE ON)
+ COMPILE_FLAGS -fsycl
+ CXX_STANDARD 17
+ CXX_STANDARD_REQUIRED ON
+ POSITION_INDEPENDENT_CODE ON)
if(USE_OPENMP)
find_package(OpenMP REQUIRED)
set_target_properties(plugin_sycl_test PROPERTIES
@@ -81,7 +81,7 @@ target_include_directories(testxgboost
${xgboost_SOURCE_DIR}/rabit/include)
target_link_libraries(testxgboost
PRIVATE
- ${GTEST_LIBRARIES})
+ GTest::gtest GTest::gmock)
set_output_directory(testxgboost ${xgboost_BINARY_DIR})
diff --git a/tests/cpp/c_api/test_c_api.cc b/tests/cpp/c_api/test_c_api.cc
index fb3e0e36f..ea3857650 100644
--- a/tests/cpp/c_api/test_c_api.cc
+++ b/tests/cpp/c_api/test_c_api.cc
@@ -439,7 +439,7 @@ void MakeLabelForTest(std::shared_ptr Xy, DMatrixHandle cxy) {
XGDMatrixSetInfoFromInterface(cxy, "label", s_y_int.c_str());
}
-auto MakeSimpleDMatrixForTest(bst_row_t n_samples, bst_feature_t n_features, Json dconfig) {
+auto MakeSimpleDMatrixForTest(bst_idx_t n_samples, bst_feature_t n_features, Json dconfig) {
HostDeviceVector storage;
auto arr_int = RandomDataGenerator{n_samples, n_features, 0.5f}.GenerateArrayInterface(&storage);
@@ -456,7 +456,7 @@ auto MakeSimpleDMatrixForTest(bst_row_t n_samples, bst_feature_t n_features, Jso
return std::pair{p_fmat, Xy};
}
-auto MakeQDMForTest(Context const *ctx, bst_row_t n_samples, bst_feature_t n_features,
+auto MakeQDMForTest(Context const *ctx, bst_idx_t n_samples, bst_feature_t n_features,
Json dconfig) {
bst_bin_t n_bins{16};
dconfig["max_bin"] = Integer{n_bins};
@@ -488,7 +488,7 @@ auto MakeQDMForTest(Context const *ctx, bst_row_t n_samples, bst_feature_t n_fea
return std::pair{p_fmat, Xy};
}
-auto MakeExtMemForTest(bst_row_t n_samples, bst_feature_t n_features, Json dconfig) {
+auto MakeExtMemForTest(bst_idx_t n_samples, bst_feature_t n_features, Json dconfig) {
std::size_t n_batches{4};
NumpyArrayIterForTest iter_0{0.0f, n_samples, n_features, n_batches};
std::string s_dconfig;
@@ -530,7 +530,7 @@ void CheckResult(Context const *ctx, bst_feature_t n_features, std::shared_ptr // for ASSERT_EQ
#include // for Span, oper...
@@ -34,8 +34,8 @@ class Worker : public WorkerForTest { std::vector data(comm_.World(), 0); data[comm_.Rank()] = comm_.Rank(); - auto rc = RingAllgather(this->comm_, common::Span{data.data(), data.size()}, 1); - ASSERT_TRUE(rc.OK()) << rc.Report(); + auto rc = RingAllgather(this->comm_, common::Span{data.data(), data.size()}); + SafeColl(rc); for (std::int32_t r = 0; r < comm_.World(); ++r) { ASSERT_EQ(data[r], r); @@ -51,8 +51,8 @@ class Worker : public WorkerForTest { auto seg = s_data.subspan(comm_.Rank() * n, n); std::iota(seg.begin(), seg.end(), comm_.Rank()); - auto rc = RingAllgather(comm_, common::Span{data.data(), data.size()}, n); - ASSERT_TRUE(rc.OK()) << rc.Report(); + auto rc = RingAllgather(comm_, common::Span{data.data(), data.size()}); + SafeColl(rc); for (std::int32_t r = 0; r < comm_.World(); ++r) { auto seg = s_data.subspan(r * n, n); @@ -81,7 +81,7 @@ class Worker : public WorkerForTest { std::vector data(comm_.Rank() + 1, comm_.Rank()); std::vector result; auto rc = RingAllgatherV(comm_, common::Span{data.data(), data.size()}, &result); - ASSERT_TRUE(rc.OK()) << rc.Report(); + SafeColl(rc); ASSERT_EQ(result.size(), (1 + comm_.World()) * comm_.World() / 2); CheckV(result); } @@ -91,7 +91,7 @@ class Worker : public WorkerForTest { std::int32_t n{comm_.Rank()}; std::vector result; auto rc = RingAllgatherV(comm_, common::Span{&n, 1}, &result); - ASSERT_TRUE(rc.OK()) << rc.Report(); + SafeColl(rc); for (std::int32_t i = 0; i < comm_.World(); ++i) { ASSERT_EQ(result[i], i); } @@ -104,8 +104,8 @@ class Worker : public WorkerForTest { std::vector sizes(comm_.World(), 0); sizes[comm_.Rank()] = s_data.size_bytes(); - auto rc = RingAllgather(comm_, common::Span{sizes.data(), sizes.size()}, 1); - ASSERT_TRUE(rc.OK()) << rc.Report(); + auto rc = RingAllgather(comm_, common::Span{sizes.data(), sizes.size()}); + SafeColl(rc); std::shared_ptr pcoll{new Coll{}}; std::vector recv_segments(comm_.World() + 1, 0); diff --git a/tests/cpp/collective/test_allgather.cu b/tests/cpp/collective/test_allgather.cu index fe2e1f3c1..dd7920d6c 100644 --- a/tests/cpp/collective/test_allgather.cu +++ b/tests/cpp/collective/test_allgather.cu @@ -1,5 +1,5 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL) #include @@ -33,8 +33,8 @@ class Worker : public NCCLWorkerForTest { // get size std::vector sizes(comm_.World(), -1); sizes[comm_.Rank()] = s_data.size_bytes(); - auto rc = RingAllgather(comm_, common::Span{sizes.data(), sizes.size()}, 1); - ASSERT_TRUE(rc.OK()) << rc.Report(); + auto rc = RingAllgather(comm_, common::Span{sizes.data(), sizes.size()}); + SafeColl(rc); // create result dh::device_vector result(comm_.World(), -1); auto s_result = common::EraseType(dh::ToSpan(result)); @@ -42,7 +42,7 @@ class Worker : public NCCLWorkerForTest { std::vector recv_seg(nccl_comm_->World() + 1, 0); rc = nccl_coll_->AllgatherV(*nccl_comm_, s_data, common::Span{sizes.data(), sizes.size()}, common::Span{recv_seg.data(), recv_seg.size()}, s_result, algo); - ASSERT_TRUE(rc.OK()) << rc.Report(); + SafeColl(rc); for (std::int32_t i = 0; i < comm_.World(); ++i) { ASSERT_EQ(result[i], i); @@ -57,8 +57,8 @@ class Worker : public NCCLWorkerForTest { // get size std::vector sizes(nccl_comm_->World(), 0); sizes[comm_.Rank()] = dh::ToSpan(data).size_bytes(); - auto rc = RingAllgather(comm_, common::Span{sizes.data(), sizes.size()}, 1); - ASSERT_TRUE(rc.OK()) << rc.Report(); + auto rc = RingAllgather(comm_, 
common::Span{sizes.data(), sizes.size()}); + SafeColl(rc); auto n_bytes = std::accumulate(sizes.cbegin(), sizes.cend(), 0); // create result dh::device_vector result(n_bytes / sizeof(std::int32_t), -1); @@ -67,7 +67,7 @@ class Worker : public NCCLWorkerForTest { std::vector recv_seg(nccl_comm_->World() + 1, 0); rc = nccl_coll_->AllgatherV(*nccl_comm_, s_data, common::Span{sizes.data(), sizes.size()}, common::Span{recv_seg.data(), recv_seg.size()}, s_result, algo); - ASSERT_TRUE(rc.OK()) << rc.Report(); + SafeColl(rc); // check segment size if (algo != AllgatherVAlgo::kBcast) { auto size = recv_seg[nccl_comm_->Rank() + 1] - recv_seg[nccl_comm_->Rank()]; diff --git a/tests/cpp/collective/test_allreduce.cc b/tests/cpp/collective/test_allreduce.cc index 21b4d9fd0..13a6ca656 100644 --- a/tests/cpp/collective/test_allreduce.cc +++ b/tests/cpp/collective/test_allreduce.cc @@ -1,11 +1,12 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include +#include // for iota + #include "../../../src/collective/allreduce.h" #include "../../../src/collective/coll.h" // for Coll -#include "../../../src/collective/tracker.h" #include "../../../src/common/type.h" // for EraseType #include "test_worker.h" // for WorkerForTest, TestDistributed @@ -58,7 +59,7 @@ class AllreduceWorker : public WorkerForTest { auto pcoll = std::shared_ptr{new Coll{}}; auto rc = pcoll->Allreduce(comm_, common::EraseType(common::Span{data.data(), data.size()}), ArrayInterfaceHandler::kU4, Op::kBitwiseOR); - ASSERT_TRUE(rc.OK()) << rc.Report(); + SafeColl(rc); for (auto v : data) { ASSERT_EQ(v, ~std::uint32_t{0}); } diff --git a/tests/cpp/collective/test_allreduce.cu b/tests/cpp/collective/test_allreduce.cu index 42c883fa7..10d56a814 100644 --- a/tests/cpp/collective/test_allreduce.cu +++ b/tests/cpp/collective/test_allreduce.cu @@ -1,11 +1,11 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #if defined(XGBOOST_USE_NCCL) || defined(XGBOOST_USE_RCCL) #include #include // for host_vector -#include "../../../src/common/common.h" +#include "../../../src/common/common.h" // for AllVisibleGPUs #include "../../../src/common/device_helpers.cuh" // for ToSpan, device_vector #include "../../../src/common/type.h" // for EraseType #include "test_worker.cuh" // for NCCLWorkerForTest @@ -24,7 +24,7 @@ class Worker : public NCCLWorkerForTest { data[comm_.Rank()] = ~std::uint32_t{0}; auto rc = nccl_coll_->Allreduce(*nccl_comm_, common::EraseType(dh::ToSpan(data)), ArrayInterfaceHandler::kU4, Op::kBitwiseOR); - ASSERT_TRUE(rc.OK()) << rc.Report(); + SafeColl(rc); thrust::host_vector h_data(data.size()); thrust::copy(data.cbegin(), data.cend(), h_data.begin()); for (auto v : h_data) { @@ -36,7 +36,7 @@ class Worker : public NCCLWorkerForTest { dh::device_vector data(314, 1.5); auto rc = nccl_coll_->Allreduce(*nccl_comm_, common::EraseType(dh::ToSpan(data)), ArrayInterfaceHandler::kF8, Op::kSum); - ASSERT_TRUE(rc.OK()) << rc.Report(); + SafeColl(rc); for (std::size_t i = 0; i < data.size(); ++i) { auto v = data[i]; ASSERT_EQ(v, 1.5 * static_cast(comm_.World())) << i; diff --git a/tests/cpp/collective/test_broadcast.cc b/tests/cpp/collective/test_broadcast.cc index 4d0d87e93..1b1d73428 100644 --- a/tests/cpp/collective/test_broadcast.cc +++ b/tests/cpp/collective/test_broadcast.cc @@ -1,5 +1,5 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include #include @@ -10,7 +10,6 @@ #include // for vector #include 
"../../../src/collective/broadcast.h" // for Broadcast -#include "../../../src/collective/tracker.h" // for GetHostAddress #include "test_worker.h" // for WorkerForTest, TestDistributed namespace xgboost::collective { @@ -24,14 +23,14 @@ class Worker : public WorkerForTest { // basic test std::vector data(1, comm_.Rank()); auto rc = Broadcast(this->comm_, common::Span{data.data(), data.size()}, r); - ASSERT_TRUE(rc.OK()) << rc.Report(); + SafeColl(rc); ASSERT_EQ(data[0], r); } for (std::int32_t r = 0; r < comm_.World(); ++r) { std::vector data(1 << 16, comm_.Rank()); auto rc = Broadcast(this->comm_, common::Span{data.data(), data.size()}, r); - ASSERT_TRUE(rc.OK()) << rc.Report(); + SafeColl(rc); ASSERT_EQ(data[0], r); } } @@ -41,11 +40,11 @@ class BroadcastTest : public SocketTest {}; } // namespace TEST_F(BroadcastTest, Basic) { - std::int32_t n_workers = std::min(7u, std::thread::hardware_concurrency()); + std::int32_t n_workers = std::min(2u, std::thread::hardware_concurrency()); TestDistributed(n_workers, [=](std::string host, std::int32_t port, std::chrono::seconds timeout, std::int32_t r) { Worker worker{host, port, timeout, n_workers, r}; worker.Run(); }); -} // namespace +} } // namespace xgboost::collective diff --git a/tests/cpp/collective/test_coll_c_api.cc b/tests/cpp/collective/test_coll_c_api.cc index d80fbc140..c7229ff77 100644 --- a/tests/cpp/collective/test_coll_c_api.cc +++ b/tests/cpp/collective/test_coll_c_api.cc @@ -25,13 +25,13 @@ TEST_F(TrackerAPITest, CAPI) { auto config_str = Json::Dump(config); auto rc = XGTrackerCreate(config_str.c_str(), &handle); ASSERT_EQ(rc, 0); - rc = XGTrackerRun(handle); + rc = XGTrackerRun(handle, nullptr); ASSERT_EQ(rc, 0); std::thread bg_wait{[&] { Json config{Object{}}; auto config_str = Json::Dump(config); - auto rc = XGTrackerWait(handle, config_str.c_str()); + auto rc = XGTrackerWaitFor(handle, config_str.c_str()); ASSERT_EQ(rc, 0); }}; @@ -42,8 +42,8 @@ TEST_F(TrackerAPITest, CAPI) { std::string host; ASSERT_TRUE(GetHostAddress(&host).OK()); - ASSERT_EQ(host, get(args["DMLC_TRACKER_URI"])); - auto port = get(args["DMLC_TRACKER_PORT"]); + ASSERT_EQ(host, get(args["dmlc_tracker_uri"])); + auto port = get(args["dmlc_tracker_port"]); ASSERT_NE(port, 0); std::vector workers; diff --git a/tests/cpp/collective/test_comm.cc b/tests/cpp/collective/test_comm.cc index 8e69b2f8e..c1eb06465 100644 --- a/tests/cpp/collective/test_comm.cc +++ b/tests/cpp/collective/test_comm.cc @@ -1,5 +1,5 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include @@ -14,7 +14,7 @@ class CommTest : public TrackerTest {}; TEST_F(CommTest, Channel) { auto n_workers = 4; - RabitTracker tracker{host, n_workers, 0, timeout}; + RabitTracker tracker{MakeTrackerConfig(host, n_workers, timeout)}; auto fut = tracker.Run(); std::vector workers; @@ -29,7 +29,7 @@ TEST_F(CommTest, Channel) { return p_chan->SendAll( EraseType(common::Span{&i, static_cast(1)})); } << [&] { return p_chan->Block(); }; - ASSERT_TRUE(rc.OK()) << rc.Report(); + SafeColl(rc); } else { auto p_chan = worker.Comm().Chan(i - 1); std::int32_t r{-1}; @@ -37,7 +37,7 @@ TEST_F(CommTest, Channel) { return p_chan->RecvAll( EraseType(common::Span{&r, static_cast(1)})); } << [&] { return p_chan->Block(); }; - ASSERT_TRUE(rc.OK()) << rc.Report(); + SafeColl(rc); ASSERT_EQ(r, i - 1); } }); diff --git a/tests/cpp/collective/test_comm_group.cc b/tests/cpp/collective/test_comm_group.cc index 0f6bc23a2..3b1b5c5df 100644 --- a/tests/cpp/collective/test_comm_group.cc +++ 
b/tests/cpp/collective/test_comm_group.cc @@ -17,17 +17,6 @@ namespace xgboost::collective { namespace { -auto MakeConfig(std::string host, std::int32_t port, std::chrono::seconds timeout, std::int32_t r) { - Json config{Object{}}; - config["dmlc_communicator"] = std::string{"rabit"}; - config["DMLC_TRACKER_URI"] = host; - config["DMLC_TRACKER_PORT"] = port; - config["dmlc_timeout_sec"] = static_cast(timeout.count()); - config["DMLC_TASK_ID"] = std::to_string(r); - config["dmlc_retry"] = 2; - return config; -} - class CommGroupTest : public SocketTest {}; } // namespace @@ -36,7 +25,7 @@ TEST_F(CommGroupTest, Basic) { TestDistributed(n_workers, [&](std::string host, std::int32_t port, std::chrono::seconds timeout, std::int32_t r) { Context ctx; - auto config = MakeConfig(host, port, timeout, r); + auto config = MakeDistributedTestConfig(host, port, timeout, r); std::unique_ptr ptr{CommGroup::Create(config)}; ASSERT_TRUE(ptr->IsDistributed()); ASSERT_EQ(ptr->World(), n_workers); @@ -52,7 +41,7 @@ TEST_F(CommGroupTest, BasicGPU) { TestDistributed(n_workers, [&](std::string host, std::int32_t port, std::chrono::seconds timeout, std::int32_t r) { auto ctx = MakeCUDACtx(r); - auto config = MakeConfig(host, port, timeout, r); + auto config = MakeDistributedTestConfig(host, port, timeout, r); std::unique_ptr ptr{CommGroup::Create(config)}; auto const& comm = ptr->Ctx(&ctx, DeviceOrd::CUDA(0)); ASSERT_EQ(comm.TaskID(), std::to_string(r)); diff --git a/tests/cpp/collective/test_loop.cc b/tests/cpp/collective/test_loop.cc index e5ef987f3..34e0c1de8 100644 --- a/tests/cpp/collective/test_loop.cc +++ b/tests/cpp/collective/test_loop.cc @@ -1,5 +1,5 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include // for ASSERT_TRUE, ASSERT_EQ #include // for TCPSocket, Connect, SocketFinalize, SocketStartup @@ -28,18 +28,23 @@ class LoopTest : public ::testing::Test { auto domain = SockDomain::kV4; pair_.first = TCPSocket::Create(domain); - auto port = pair_.first.BindHost(); - pair_.first.Listen(); + std::int32_t port{0}; + auto rc = Success() << [&] { + return pair_.first.BindHost(&port); + } << [&] { + return pair_.first.Listen(); + }; + SafeColl(rc); auto const& addr = SockAddrV4::Loopback().Addr(); - auto rc = Connect(StringView{addr}, port, 1, timeout, &pair_.second); - ASSERT_TRUE(rc.OK()); + rc = Connect(StringView{addr}, port, 1, timeout, &pair_.second); + SafeColl(rc); rc = pair_.second.NonBlocking(true); - ASSERT_TRUE(rc.OK()); + SafeColl(rc); pair_.first = pair_.first.Accept(); rc = pair_.first.NonBlocking(true); - ASSERT_TRUE(rc.OK()); + SafeColl(rc); loop_ = std::shared_ptr{new Loop{timeout}}; } @@ -74,8 +79,26 @@ TEST_F(LoopTest, Op) { loop_->Submit(rop); auto rc = loop_->Block(); - ASSERT_TRUE(rc.OK()) << rc.Report(); + SafeColl(rc); ASSERT_EQ(rbuf[0], wbuf[0]); } + +TEST_F(LoopTest, Block) { + // We need to ensure that a blocking call doesn't go unanswered. + auto op = Loop::Op::Sleep(2); + + common::Timer t; + t.Start(); + loop_->Submit(op); + t.Stop(); + // submit is non-blocking + ASSERT_LT(t.ElapsedSeconds(), 1); + + t.Start(); + auto rc = loop_->Block(); + t.Stop(); + SafeColl(rc); + ASSERT_GE(t.ElapsedSeconds(), 1); +} } // namespace xgboost::collective diff --git a/tests/cpp/collective/test_rabit_communicator.cc b/tests/cpp/collective/test_rabit_communicator.cc index ba22d8fdb..9711e1aed 100644 --- a/tests/cpp/collective/test_rabit_communicator.cc +++ b/tests/cpp/collective/test_rabit_communicator.cc @@ -1,13 +1,12 @@ -/*! 
- * Copyright 2022 XGBoost contributors +/** + * Copyright 2022-2024, XGBoost contributors */ #include #include "../../../src/collective/rabit_communicator.h" +#include "../helpers.h" -namespace xgboost { -namespace collective { - +namespace xgboost::collective { TEST(RabitCommunicatorSimpleTest, ThrowOnWorldSizeTooSmall) { auto construct = []() { RabitCommunicator comm{0, 0}; }; EXPECT_THROW(construct(), dmlc::Error); @@ -35,5 +34,37 @@ TEST(RabitCommunicatorSimpleTest, IsNotDistributed) { EXPECT_FALSE(comm.IsDistributed()); } -} // namespace collective -} // namespace xgboost +namespace { +void VerifyVectorAllgatherV() { + auto n_workers = collective::GetWorldSize(); + ASSERT_EQ(n_workers, 3); + auto rank = collective::GetRank(); + // Construct input that has different length for each worker. + std::vector> inputs; + for (std::int32_t i = 0; i < rank + 1; ++i) { + std::vector in; + for (std::int32_t j = 0; j < rank + 1; ++j) { + in.push_back(static_cast(j)); + } + inputs.emplace_back(std::move(in)); + } + + auto outputs = VectorAllgatherV(inputs); + + ASSERT_EQ(outputs.size(), (1 + n_workers) * n_workers / 2); + auto const& res = outputs; + + for (std::int32_t i = 0; i < n_workers; ++i) { + std::int32_t k = 0; + for (auto v : res[i]) { + ASSERT_EQ(v, k++); + } + } +} +} // namespace + +TEST(VectorAllgatherV, Basic) { + std::int32_t n_workers{3}; + RunWithInMemoryCommunicator(n_workers, VerifyVectorAllgatherV); +} +} // namespace xgboost::collective diff --git a/tests/cpp/collective/test_result.cc b/tests/cpp/collective/test_result.cc new file mode 100644 index 000000000..1c7194f92 --- /dev/null +++ b/tests/cpp/collective/test_result.cc @@ -0,0 +1,31 @@ +/** + * Copyright 2024, XGBoost Contributors + */ +#include +#include + +namespace xgboost::collective { +TEST(Result, Concat) { + auto rc0 = Fail("foo"); + auto rc1 = Fail("bar"); + auto rc = std::move(rc0) + std::move(rc1); + ASSERT_NE(rc.Report().find("foo"), std::string::npos); + ASSERT_NE(rc.Report().find("bar"), std::string::npos); + + auto rc2 = Fail("Another", std::move(rc)); + auto assert_that = [](Result const& rc) { + ASSERT_NE(rc.Report().find("Another"), std::string::npos); + ASSERT_NE(rc.Report().find("foo"), std::string::npos); + ASSERT_NE(rc.Report().find("bar"), std::string::npos); + }; + assert_that(rc2); + + auto empty = Success(); + auto rc3 = std::move(empty) + std::move(rc2); + assert_that(rc3); + + empty = Success(); + auto rc4 = std::move(rc3) + std::move(empty); + assert_that(rc4); +} +} // namespace xgboost::collective diff --git a/tests/cpp/collective/test_socket.cc b/tests/cpp/collective/test_socket.cc index ced795fef..ea57da9b4 100644 --- a/tests/cpp/collective/test_socket.cc +++ b/tests/cpp/collective/test_socket.cc @@ -1,5 +1,5 @@ /** - * Copyright 2022-2023, XGBoost Contributors + * Copyright 2022-2024, XGBoost Contributors */ #include #include @@ -21,14 +21,19 @@ TEST_F(SocketTest, Basic) { auto run_test = [msg](SockDomain domain) { auto server = TCPSocket::Create(domain); ASSERT_EQ(server.Domain(), domain); - auto port = server.BindHost(); - server.Listen(); + std::int32_t port{0}; + auto rc = Success() << [&] { + return server.BindHost(&port); + } << [&] { + return server.Listen(); + }; + SafeColl(rc); TCPSocket client; if (domain == SockDomain::kV4) { auto const& addr = SockAddrV4::Loopback().Addr(); auto rc = Connect(StringView{addr}, port, 1, std::chrono::seconds{3}, &client); - ASSERT_TRUE(rc.OK()) << rc.Report(); + SafeColl(rc); } else { auto const& addr = SockAddrV6::Loopback().Addr(); auto rc = 
Connect(StringView{addr}, port, 1, std::chrono::seconds{3}, &client); @@ -45,7 +50,8 @@ TEST_F(SocketTest, Basic) { accepted.Send(msg); std::string str; - client.Recv(&str); + rc = client.Recv(&str); + SafeColl(rc); ASSERT_EQ(StringView{str}, msg); }; diff --git a/tests/cpp/collective/test_tracker.cc b/tests/cpp/collective/test_tracker.cc index 0dce33c0c..8d6cbeff2 100644 --- a/tests/cpp/collective/test_tracker.cc +++ b/tests/cpp/collective/test_tracker.cc @@ -1,6 +1,7 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ +#include #include #include // for seconds @@ -10,6 +11,7 @@ #include // for vector #include "../../../src/collective/comm.h" +#include "../helpers.h" // for GMockThrow #include "test_worker.h" namespace xgboost::collective { @@ -20,13 +22,13 @@ class PrintWorker : public WorkerForTest { void Print() { auto rc = comm_.LogTracker("ack:" + std::to_string(this->comm_.Rank())); - ASSERT_TRUE(rc.OK()) << rc.Report(); + SafeColl(rc); } }; } // namespace TEST_F(TrackerTest, Bootstrap) { - RabitTracker tracker{host, n_workers, 0, timeout}; + RabitTracker tracker{MakeTrackerConfig(host, n_workers, timeout)}; ASSERT_FALSE(tracker.Ready()); auto fut = tracker.Run(); @@ -34,7 +36,7 @@ TEST_F(TrackerTest, Bootstrap) { auto args = tracker.WorkerArgs(); ASSERT_TRUE(tracker.Ready()); - ASSERT_EQ(get(args["DMLC_TRACKER_URI"]), host); + ASSERT_EQ(get(args["dmlc_tracker_uri"]), host); std::int32_t port = tracker.Port(); @@ -44,12 +46,11 @@ TEST_F(TrackerTest, Bootstrap) { for (auto &w : workers) { w.join(); } - - ASSERT_TRUE(fut.get().OK()); + SafeColl(fut.get()); } TEST_F(TrackerTest, Print) { - RabitTracker tracker{host, n_workers, 0, timeout}; + RabitTracker tracker{MakeTrackerConfig(host, n_workers, timeout)}; auto fut = tracker.Run(); std::vector workers; @@ -73,4 +74,47 @@ TEST_F(TrackerTest, Print) { } TEST_F(TrackerTest, GetHostAddress) { ASSERT_TRUE(host.find("127.") == std::string::npos); } + +/** + * Test connecting the tracker after it has finished. This should not hang the workers. + */ +TEST_F(TrackerTest, AfterShutdown) { + RabitTracker tracker{MakeTrackerConfig(host, n_workers, timeout)}; + auto fut = tracker.Run(); + + std::vector workers; + auto rc = tracker.WaitUntilReady(); + ASSERT_TRUE(rc.OK()); + + std::int32_t port = tracker.Port(); + + // Launch no-op workers to cause the tracker to shutdown. + for (std::int32_t i = 0; i < n_workers; ++i) { + workers.emplace_back([=] { WorkerForTest worker{host, port, timeout, n_workers, i}; }); + } + + for (auto &w : workers) { + w.join(); + } + + ASSERT_TRUE(fut.get().OK()); + + // Launch workers again, they should fail. + workers.clear(); + for (std::int32_t i = 0; i < n_workers; ++i) { + auto assert_that = [=] { + WorkerForTest worker{host, port, timeout, n_workers, i}; + }; + // On a Linux platform, the connection will be refused, on Apple platform, this gets + // an operation now in progress poll failure, on Windows, it's a timeout error. 
+#if defined(__linux__) + workers.emplace_back([=] { ASSERT_THAT(assert_that, GMockThrow("Connection refused")); }); +#else + workers.emplace_back([=] { ASSERT_THAT(assert_that, GMockThrow("Failed to connect to")); }); +#endif + } + for (auto &w : workers) { + w.join(); + } +} } // namespace xgboost::collective diff --git a/tests/cpp/collective/test_worker.h b/tests/cpp/collective/test_worker.h index acee0f297..c84df528f 100644 --- a/tests/cpp/collective/test_worker.h +++ b/tests/cpp/collective/test_worker.h @@ -1,11 +1,12 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #pragma once #include #include // for seconds #include // for int32_t +#include // for ifstream #include // for string #include // for thread #include // for move @@ -36,7 +37,7 @@ class WorkerForTest { comm_{tracker_host_, tracker_port_, timeout, retry_, task_id_, DefaultNcclName()} { CHECK_EQ(world_size_, comm_.World()); } - virtual ~WorkerForTest() = default; + virtual ~WorkerForTest() noexcept(false) { SafeColl(comm_.Shutdown()); } auto& Comm() { return comm_; } void LimitSockBuf(std::int32_t n_bytes) { @@ -86,19 +87,30 @@ class TrackerTest : public SocketTest { void SetUp() override { SocketTest::SetUp(); auto rc = GetHostAddress(&host); - ASSERT_TRUE(rc.OK()) << rc.Report(); + SafeColl(rc); } }; +inline Json MakeTrackerConfig(std::string host, std::int32_t n_workers, + std::chrono::seconds timeout) { + Json config{Object{}}; + config["host"] = host; + config["port"] = Integer{0}; + config["n_workers"] = Integer{n_workers}; + config["sortby"] = Integer{static_cast(Tracker::SortBy::kHost)}; + config["timeout"] = timeout.count(); + return config; +} + template void TestDistributed(std::int32_t n_workers, WorkerFn worker_fn) { std::chrono::seconds timeout{2}; std::string host; auto rc = GetHostAddress(&host); - ASSERT_TRUE(rc.OK()) << rc.Report(); + SafeColl(rc); LOG(INFO) << "Using " << n_workers << " workers for test."; - RabitTracker tracker{StringView{host}, n_workers, 0, timeout}; + RabitTracker tracker{MakeTrackerConfig(host, n_workers, timeout)}; auto fut = tracker.Run(); std::vector workers; @@ -114,4 +126,15 @@ void TestDistributed(std::int32_t n_workers, WorkerFn worker_fn) { ASSERT_TRUE(fut.get().OK()); } +inline auto MakeDistributedTestConfig(std::string host, std::int32_t port, + std::chrono::seconds timeout, std::int32_t r) { + Json config{Object{}}; + config["dmlc_communicator"] = std::string{"rabit"}; + config["dmlc_tracker_uri"] = host; + config["dmlc_tracker_port"] = port; + config["dmlc_timeout_sec"] = static_cast(timeout.count()); + config["dmlc_task_id"] = std::to_string(r); + config["dmlc_retry"] = 2; + return config; +} } // namespace xgboost::collective diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu index f62c9d34a..bbfcf867d 100644 --- a/tests/cpp/common/test_device_helpers.cu +++ b/tests/cpp/common/test_device_helpers.cu @@ -1,14 +1,16 @@ -/*! 
- * Copyright 2017-2021 XGBoost contributors +/** + * Copyright 2017-2024, XGBoost contributors */ +#include +#include // for is_sorted +#include + #include #include -#include #include -#include + #include "../../../src/common/device_helpers.cuh" #include "../../../src/common/quantile.h" -#include "../helpers.h" #include "gtest/gtest.h" TEST(SumReduce, Test) { diff --git a/tests/cpp/common/test_hist_util.cc b/tests/cpp/common/test_hist_util.cc index 5391bc2cf..24e67c9aa 100644 --- a/tests/cpp/common/test_hist_util.cc +++ b/tests/cpp/common/test_hist_util.cc @@ -1,10 +1,9 @@ /** - * Copyright 2019-2023 by XGBoost Contributors + * Copyright 2019-2024, XGBoost Contributors */ #include #include #include -#include #include "../../../src/common/hist_util.h" #include "../../../src/data/gradient_index.h" @@ -135,7 +134,7 @@ TEST(CutsBuilder, SearchGroupInd) { group[2] = 7; group[3] = 5; - p_mat->SetInfo("group", group.data(), DataType::kUInt32, kNumGroups); + p_mat->SetInfo("group", Make1dInterfaceTest(group.data(), group.size())); HistogramCuts hmat; @@ -348,7 +347,8 @@ void TestSketchFromWeights(bool with_group) { for (size_t i = 0; i < kGroups; ++i) { groups[i] = kRows / kGroups; } - info.SetInfo(ctx, "group", groups.data(), DataType::kUInt32, kGroups); + auto sg = linalg::Make1dInterface(groups.data(), kGroups); + info.SetInfo(ctx, "group", sg.c_str()); } info.num_row_ = kRows; @@ -356,10 +356,10 @@ void TestSketchFromWeights(bool with_group) { // Assign weights. if (with_group) { - m->SetInfo("group", groups.data(), DataType::kUInt32, kGroups); + m->SetInfo("group", Make1dInterfaceTest(groups.data(), kGroups)); } - m->SetInfo("weight", h_weights.data(), DataType::kFloat32, h_weights.size()); + m->SetInfo("weight", Make1dInterfaceTest(h_weights.data(), h_weights.size())); m->Info().num_col_ = kCols; m->Info().num_row_ = kRows; ASSERT_EQ(cuts.Ptrs().size(), kCols + 1); diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index ab42bb365..ce78e9a58 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -1,5 +1,5 @@ /** - * Copyright 2019-2023 by XGBoost Contributors + * Copyright 2019-2024, XGBoost Contributors */ #include #include @@ -181,7 +181,7 @@ void TestMixedSketch() { TEST(HistUtil, DeviceSketchMixedFeatures) { TestMixedSketch(); } TEST(HistUtil, RemoveDuplicatedCategories) { - bst_row_t n_samples = 512; + bst_idx_t n_samples = 512; bst_feature_t n_features = 3; bst_cat_t n_categories = 5; @@ -210,13 +210,13 @@ TEST(HistUtil, RemoveDuplicatedCategories) { FeatureType::kNumerical, FeatureType::kCategorical, FeatureType::kNumerical}; ASSERT_EQ(info.feature_types.Size(), n_features); - HostDeviceVector cuts_ptr{0, n_samples, n_samples * 2, n_samples * 3}; + HostDeviceVector cuts_ptr{0, n_samples, n_samples * 2, n_samples * 3}; cuts_ptr.SetDevice(DeviceOrd::CUDA(0)); dh::device_vector weight(n_samples * n_features, 0); dh::Iota(dh::ToSpan(weight), ctx.CUDACtx()->Stream()); - dh::caching_device_vector columns_ptr(4); + dh::caching_device_vector columns_ptr(4); for (std::size_t i = 0; i < columns_ptr.size(); ++i) { columns_ptr[i] = i * n_samples; } @@ -641,7 +641,7 @@ void TestGetColumnSize(std::size_t n_samples) { } // namespace TEST(HistUtil, GetColumnSize) { - bst_row_t n_samples = 4096; + bst_idx_t n_samples = 4096; TestGetColumnSize(n_samples); } @@ -684,7 +684,7 @@ TEST(HistUtil, DeviceSketchFromGroupWeights) { for (size_t i = 0; i < kGroups; ++i) { groups[i] = kRows / kGroups; } - m->SetInfo("group", groups.data(), 
DataType::kUInt32, kGroups); + m->SetInfo("group", Make1dInterfaceTest(groups.data(), kGroups)); HistogramCuts weighted_cuts = DeviceSketch(&ctx, m.get(), kBins, 0); // sketch with no weight @@ -729,7 +729,7 @@ void TestAdapterSketchFromWeights(bool with_group) { for (size_t i = 0; i < kGroups; ++i) { groups[i] = kRows / kGroups; } - info.SetInfo(ctx, "group", groups.data(), DataType::kUInt32, kGroups); + info.SetInfo(ctx, "group", Make1dInterfaceTest(groups.data(), kGroups)); } info.weights_.SetDevice(DeviceOrd::CUDA(0)); @@ -748,10 +748,10 @@ void TestAdapterSketchFromWeights(bool with_group) { auto dmat = GetDMatrixFromData(storage.HostVector(), kRows, kCols); if (with_group) { - dmat->Info().SetInfo(ctx, "group", groups.data(), DataType::kUInt32, kGroups); + dmat->Info().SetInfo(ctx, "group", Make1dInterfaceTest(groups.data(), kGroups)); } - dmat->Info().SetInfo(ctx, "weight", h_weights.data(), DataType::kFloat32, h_weights.size()); + dmat->Info().SetInfo(ctx, "weight", Make1dInterfaceTest(h_weights.data(), h_weights.size())); dmat->Info().num_col_ = kCols; dmat->Info().num_row_ = kRows; ASSERT_EQ(cuts.Ptrs().size(), kCols + 1); @@ -797,11 +797,11 @@ TEST(HistUtil, AdapterSketchFromWeights) { namespace { class DeviceSketchWithHessianTest - : public ::testing::TestWithParam> { + : public ::testing::TestWithParam> { bst_feature_t n_features_ = 5; bst_group_t n_groups_{3}; - auto GenerateHessian(Context const* ctx, bst_row_t n_samples) const { + auto GenerateHessian(Context const* ctx, bst_idx_t n_samples) const { HostDeviceVector hessian; auto& h_hess = hessian.HostVector(); h_hess = GenerateRandomWeights(n_samples); @@ -846,7 +846,7 @@ class DeviceSketchWithHessianTest protected: Context ctx_ = MakeCUDACtx(0); - void TestLTR(Context const* ctx, bst_row_t n_samples, bst_bin_t n_bins, + void TestLTR(Context const* ctx, bst_idx_t n_samples, bst_bin_t n_bins, std::size_t n_elements) const { auto x = GenerateRandom(n_samples, n_features_); @@ -899,7 +899,7 @@ class DeviceSketchWithHessianTest } } - void TestRegression(Context const* ctx, bst_row_t n_samples, bst_bin_t n_bins, + void TestRegression(Context const* ctx, bst_idx_t n_samples, bst_bin_t n_bins, std::size_t n_elements) const { auto x = GenerateRandom(n_samples, n_features_); auto p_fmat = GetDMatrixFromData(x, n_samples, n_features_); @@ -912,9 +912,9 @@ class DeviceSketchWithHessianTest }; auto MakeParamsForTest() { - std::vector sizes = {1, 2, 256, 512, 1000, 1500}; + std::vector sizes = {1, 2, 256, 512, 1000, 1500}; std::vector bin_sizes = {2, 16, 256, 512}; - std::vector> configs; + std::vector> configs; for (auto n_samples : sizes) { for (auto n_bins : bin_sizes) { configs.emplace_back(true, n_samples, n_bins); diff --git a/tests/cpp/common/test_io.cc b/tests/cpp/common/test_io.cc index 4c4d4efe0..face21851 100644 --- a/tests/cpp/common/test_io.cc +++ b/tests/cpp/common/test_io.cc @@ -1,10 +1,11 @@ /** - * Copyright 2019-2023, XGBoost Contributors + * Copyright 2019-2024, XGBoost Contributors */ #include #include // for size_t #include // for ofstream +#include // for iota #include "../../../src/common/io.h" #include "../filesystem.h" // dmlc::TemporaryDirectory diff --git a/tests/cpp/common/test_json.cc b/tests/cpp/common/test_json.cc index d361552ce..e144bdc45 100644 --- a/tests/cpp/common/test_json.cc +++ b/tests/cpp/common/test_json.cc @@ -1,13 +1,13 @@ /** - * Copyright 2019-2023, XGBoost Contributors + * Copyright 2019-2024, XGBoost Contributors */ #include #include -#include // for back_inserter +#include // for 
numeric_limits #include +#include // for iota -#include "../../../src/common/charconv.h" #include "../../../src/common/io.h" #include "../../../src/common/json_utils.h" #include "../../../src/common/threading_utils.h" // for ParallelFor @@ -639,6 +639,40 @@ TEST(Json, TypedArray) { ASSERT_EQ(arr[i + 8], i); } } + + { + Json f64{Object{}}; + auto array = F64Array(); + auto& vec = array.GetArray(); + // Construct test data + vec.resize(18); + std::iota(vec.begin(), vec.end(), 0.0); + // special values + vec.push_back(std::numeric_limits::epsilon()); + vec.push_back(std::numeric_limits::max()); + vec.push_back(std::numeric_limits::min()); + vec.push_back(std::numeric_limits::denorm_min()); + vec.push_back(std::numeric_limits::quiet_NaN()); + + static_assert( + std::is_same_v::value_type>); + + f64["f64"] = std::move(array); + ASSERT_TRUE(IsA(f64["f64"])); + std::vector out; + Json::Dump(f64, &out, std::ios::binary); + + auto loaded = Json::Load(StringView{out.data(), out.size()}, std::ios::binary); + ASSERT_TRUE(IsA(loaded["f64"])); + auto const& result = get(loaded["f64"]); + + auto& vec1 = get(f64["f64"]); + ASSERT_EQ(result.size(), vec1.size()); + for (std::size_t i = 0; i < vec1.size() - 1; ++i) { + ASSERT_EQ(result[i], vec1[i]); + } + ASSERT_TRUE(std::isnan(result.back())); + } } TEST(UBJson, Basic) { @@ -677,8 +711,24 @@ TEST(UBJson, Basic) { ASSERT_FLOAT_EQ(3.14, get(get(ret["test"])[1])); ASSERT_FLOAT_EQ(2.71, get(get(ret["test"])[0])); } + { + // boolean + Json boolean{Object{}}; + boolean["foo"] = Boolean{false}; + std::vector out; + Json::Dump(boolean, &out, std::ios::binary); + auto loaded = Json::Load(StringView{out.data(), out.size()}, std::ios::binary); + + ASSERT_EQ(boolean, loaded); + + boolean["foo"] = Boolean{true}; + Json::Dump(boolean, &out, std::ios::binary); + loaded = Json::Load(StringView{out.data(), out.size()}, std::ios::binary); + ASSERT_EQ(boolean, loaded); + } } + TEST(Json, TypeCheck) { Json config{Object{}}; config["foo"] = String{"bar"}; diff --git a/tests/cpp/common/test_linalg.cu b/tests/cpp/common/test_linalg.cu index d14260635..bf217842b 100644 --- a/tests/cpp/common/test_linalg.cu +++ b/tests/cpp/common/test_linalg.cu @@ -1,7 +1,11 @@ /** - * Copyright 2021-2023 by XGBoost Contributors + * Copyright 2021-2024, XGBoost Contributors */ #include +#include // for equal +#include // for sequence + +#include "../../../src/common/cuda_context.cuh" #include "../../../src/common/linalg_op.cuh" #include "../helpers.h" #include "xgboost/context.h" @@ -84,4 +88,23 @@ void TestSlice() { TEST(Linalg, GPUElementWise) { TestElementWiseKernel(); } TEST(Linalg, GPUTensorView) { TestSlice(); } + +TEST(Linalg, GPUIter) { + auto ctx = MakeCUDACtx(1); + auto cuctx = ctx.CUDACtx(); + + dh::device_vector data(2 * 3 * 4); + thrust::sequence(cuctx->CTP(), data.begin(), data.end(), 1.0); + + auto t = MakeTensorView(&ctx, dh::ToSpan(data), 2, 3, 4); + static_assert(!std::is_const_v); + static_assert(!std::is_const_v); + + auto n = std::distance(linalg::tcbegin(t), linalg::tcend(t)); + ASSERT_EQ(n, t.Size()); + ASSERT_FALSE(t.Empty()); + + bool eq = thrust::equal(cuctx->CTP(), data.cbegin(), data.cend(), linalg::tcbegin(t)); + ASSERT_TRUE(eq); +} } // namespace xgboost::linalg diff --git a/tests/cpp/common/test_parameter.cc b/tests/cpp/common/test_parameter.cc index 5e8021a1e..5288366f8 100644 --- a/tests/cpp/common/test_parameter.cc +++ b/tests/cpp/common/test_parameter.cc @@ -97,4 +97,9 @@ TEST(XGBoostParameter, Update) { ASSERT_NEAR(p.f, 2.71828f, kRtEps); ASSERT_NEAR(p.d, 
2.71828, kRtEps); // default } + + // Just in case dmlc's use of global memory has any impact in parameters. + UpdatableParam a, b; + a.UpdateAllowUnknown(xgboost::Args{{"f", "2.71828"}}); + ASSERT_NE(a.f, b.f); } diff --git a/tests/cpp/common/test_quantile.cc b/tests/cpp/common/test_quantile.cc index 9fa1566ea..26937be76 100644 --- a/tests/cpp/common/test_quantile.cc +++ b/tests/cpp/common/test_quantile.cc @@ -50,7 +50,7 @@ void DoTestDistributedQuantile(size_t rows, size_t cols) { SimpleLCG lcg; SimpleRealUniformDistribution dist(3, 1000); std::generate(h_weights.begin(), h_weights.end(), [&]() { return dist(&lcg); }); - std::vector column_size(cols, rows); + std::vector column_size(cols, rows); bst_bin_t n_bins = 64; // Generate cuts for distributed environment. @@ -192,7 +192,7 @@ void DoTestColSplitQuantile(size_t rows, size_t cols) { return dmat->SliceCol(world, rank); }()}; - std::vector column_size(cols, 0); + std::vector column_size(cols, 0); auto const slice_size = cols / world; auto const slice_start = slice_size * rank; auto const slice_end = (rank == world - 1) ? cols : slice_start + slice_size; diff --git a/tests/cpp/common/test_quantile.cu b/tests/cpp/common/test_quantile.cu index 26bd05524..070c705b5 100644 --- a/tests/cpp/common/test_quantile.cu +++ b/tests/cpp/common/test_quantile.cu @@ -27,7 +27,7 @@ TEST(GPUQuantile, Basic) { HostDeviceVector ft; SketchContainer sketch(ft, kBins, kCols, kRows, FstCU()); dh::caching_device_vector entries; - dh::device_vector cuts_ptr(kCols+1); + dh::device_vector cuts_ptr(kCols+1); thrust::fill(cuts_ptr.begin(), cuts_ptr.end(), 0); // Push empty sketch.Push(dh::ToSpan(entries), dh::ToSpan(cuts_ptr), dh::ToSpan(cuts_ptr), 0); @@ -87,11 +87,11 @@ TEST(GPUQuantile, Unique) { // if with_error is true, the test tolerates floating point error void TestQuantileElemRank(DeviceOrd device, Span in, - Span d_columns_ptr, bool with_error = false) { + Span d_columns_ptr, bool with_error = false) { dh::safe_cuda(cudaSetDevice(device.ordinal)); std::vector h_in(in.size()); dh::CopyDeviceSpanToVector(&h_in, in); - std::vector h_columns_ptr(d_columns_ptr.size()); + std::vector h_columns_ptr(d_columns_ptr.size()); dh::CopyDeviceSpanToVector(&h_columns_ptr, d_columns_ptr); for (size_t i = 1; i < d_columns_ptr.size(); ++i) { @@ -164,7 +164,7 @@ TEST(GPUQuantile, MergeEmpty) { std::vector entries_before(sketch_0.Data().size()); dh::CopyDeviceSpanToVector(&entries_before, sketch_0.Data()); - std::vector ptrs_before(sketch_0.ColumnsPtr().size()); + std::vector ptrs_before(sketch_0.ColumnsPtr().size()); dh::CopyDeviceSpanToVector(&ptrs_before, sketch_0.ColumnsPtr()); thrust::device_vector columns_ptr(kCols + 1); // Merge an empty sketch @@ -172,7 +172,7 @@ TEST(GPUQuantile, MergeEmpty) { std::vector entries_after(sketch_0.Data().size()); dh::CopyDeviceSpanToVector(&entries_after, sketch_0.Data()); - std::vector ptrs_after(sketch_0.ColumnsPtr().size()); + std::vector ptrs_after(sketch_0.ColumnsPtr().size()); dh::CopyDeviceSpanToVector(&ptrs_after, sketch_0.ColumnsPtr()); CHECK_EQ(entries_before.size(), entries_after.size()); @@ -222,7 +222,7 @@ TEST(GPUQuantile, MergeBasic) { } auto columns_ptr = sketch_0.ColumnsPtr(); - std::vector h_columns_ptr(columns_ptr.size()); + std::vector h_columns_ptr(columns_ptr.size()); dh::CopyDeviceSpanToVector(&h_columns_ptr, columns_ptr); ASSERT_EQ(h_columns_ptr.back(), sketch_1.Data().size() + size_before_merge); @@ -278,7 +278,7 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) { 
TestQuantileElemRank(FstCU(), sketch_0.Data(), sketch_0.ColumnsPtr()); auto columns_ptr = sketch_0.ColumnsPtr(); - std::vector h_columns_ptr(columns_ptr.size()); + std::vector h_columns_ptr(columns_ptr.size()); dh::CopyDeviceSpanToVector(&h_columns_ptr, columns_ptr); ASSERT_EQ(h_columns_ptr.back(), sketch_1.Data().size() + size_before_merge); diff --git a/tests/cpp/common/test_span.cu b/tests/cpp/common/test_span.cu index 4211fb545..3182ebbe2 100644 --- a/tests/cpp/common/test_span.cu +++ b/tests/cpp/common/test_span.cu @@ -1,13 +1,15 @@ -/*! - * Copyright 2018 XGBoost contributors +/** + * Copyright 2018-2024, XGBoost contributors */ #include - -#include #include #include -#include "../../../src/common/device_helpers.cuh" +#include #include + +#include // for iota + +#include "../../../src/common/device_helpers.cuh" #include "test_span.h" namespace xgboost { diff --git a/tests/cpp/common/test_transform_range.cc b/tests/cpp/common/test_transform_range.cc index 0b14bdc8f..102960c52 100644 --- a/tests/cpp/common/test_transform_range.cc +++ b/tests/cpp/common/test_transform_range.cc @@ -1,11 +1,12 @@ /** - * Copyright 2018-2023 by XGBoost Contributors + * Copyright 2018-2024, XGBoost Contributors */ #include #include -#include #include +#include +#include // for iota #include #include "../../../src/common/transform.h" diff --git a/tests/cpp/data/test_adapter.cc b/tests/cpp/data/test_adapter.cc index fa3ed61f6..f34cfceed 100644 --- a/tests/cpp/data/test_adapter.cc +++ b/tests/cpp/data/test_adapter.cc @@ -36,7 +36,7 @@ TEST(Adapter, CSRAdapter) { } TEST(Adapter, CSRArrayAdapter) { - HostDeviceVector indptr; + HostDeviceVector indptr; HostDeviceVector values; HostDeviceVector indices; size_t n_features = 100, n_samples = 10; @@ -155,7 +155,7 @@ TEST(Adapter, IteratorAdapter) { ASSERT_EQ(data->Info().num_row_, kRows); int num_batch = 0; for (auto const& batch : data->GetBatches()) { - ASSERT_EQ(batch.offset.HostVector(), std::vector({0, 2, 4, 5, 5, 7, 9, 10, 10})); + ASSERT_EQ(batch.offset.HostVector(), std::vector({0, 2, 4, 5, 5, 7, 9, 10, 10})); ++num_batch; } ASSERT_EQ(num_batch, 1); diff --git a/tests/cpp/data/test_array_interface.cu b/tests/cpp/data/test_array_interface.cu index 00b996fb9..be8160c8a 100644 --- a/tests/cpp/data/test_array_interface.cu +++ b/tests/cpp/data/test_array_interface.cu @@ -1,10 +1,11 @@ /** - * Copyright 2021-2023, XGBoost Contributors + * Copyright 2021-2024, XGBoost Contributors */ #include #include -#include "../helpers.h" + #include "../../../src/data/array_interface.h" +#include "../helpers.h" namespace xgboost { diff --git a/tests/cpp/data/test_data.cc b/tests/cpp/data/test_data.cc index 99cd72cc0..f9e34790d 100644 --- a/tests/cpp/data/test_data.cc +++ b/tests/cpp/data/test_data.cc @@ -13,7 +13,7 @@ namespace xgboost { TEST(SparsePage, PushCSC) { - std::vector offset {0}; + std::vector offset {0}; std::vector data; SparsePage batch; batch.offset.HostVector() = offset; diff --git a/tests/cpp/data/test_device_adapter.cu b/tests/cpp/data/test_device_adapter.cu index 2c86c98b1..0ead3c2f4 100644 --- a/tests/cpp/data/test_device_adapter.cu +++ b/tests/cpp/data/test_device_adapter.cu @@ -64,7 +64,7 @@ TEST(DeviceAdapter, GetRowCounts) { .Device(ctx.Device()) .GenerateArrayInterface(&storage); auto adapter = CupyAdapter{str_arr}; - HostDeviceVector offset(adapter.NumRows() + 1, 0); + HostDeviceVector offset(adapter.NumRows() + 1, 0); offset.SetDevice(ctx.Device()); auto rstride = GetRowCounts(adapter.Value(), offset.DeviceSpan(), ctx.Device(), 
std::numeric_limits::quiet_NaN()); diff --git a/tests/cpp/data/test_metainfo.cc b/tests/cpp/data/test_metainfo.cc index b3f3a67ca..a26e7cbd7 100644 --- a/tests/cpp/data/test_metainfo.cc +++ b/tests/cpp/data/test_metainfo.cc @@ -1,15 +1,17 @@ -// Copyright 2016-2021 by Contributors +/** + * Copyright 2016-2024, XGBoost contributors + */ #include "test_metainfo.h" +#include #include #include #include #include -#include "../../../src/common/version.h" #include "../filesystem.h" // dmlc::TemporaryDirectory -#include "../helpers.h" +#include "../helpers.h" // for GMockTHrow #include "xgboost/base.h" namespace xgboost { @@ -20,23 +22,22 @@ TEST(MetaInfo, GetSet) { double double2[2] = {1.0, 2.0}; EXPECT_EQ(info.labels.Size(), 0); - info.SetInfo(ctx, "label", double2, xgboost::DataType::kFloat32, 2); + info.SetInfo(ctx, "label", Make1dInterfaceTest(double2, 2)); EXPECT_EQ(info.labels.Size(), 2); float float2[2] = {1.0f, 2.0f}; - EXPECT_EQ(info.GetWeight(1), 1.0f) - << "When no weights are given, was expecting default value 1"; - info.SetInfo(ctx, "weight", float2, xgboost::DataType::kFloat32, 2); + EXPECT_EQ(info.GetWeight(1), 1.0f) << "When no weights are given, was expecting default value 1"; + info.SetInfo(ctx, "weight", Make1dInterfaceTest(float2, 2)); EXPECT_EQ(info.GetWeight(1), 2.0f); uint32_t uint32_t2[2] = {1U, 2U}; EXPECT_EQ(info.base_margin_.Size(), 0); - info.SetInfo(ctx, "base_margin", uint32_t2, xgboost::DataType::kUInt32, 2); + info.SetInfo(ctx, "base_margin", Make1dInterfaceTest(uint32_t2, 2)); EXPECT_EQ(info.base_margin_.Size(), 2); uint64_t uint64_t2[2] = {1U, 2U}; EXPECT_EQ(info.group_ptr_.size(), 0); - info.SetInfo(ctx, "group", uint64_t2, xgboost::DataType::kUInt64, 2); + info.SetInfo(ctx, "group", Make1dInterfaceTest(uint64_t2, 2)); ASSERT_EQ(info.group_ptr_.size(), 3); EXPECT_EQ(info.group_ptr_[2], 3); @@ -46,6 +47,8 @@ TEST(MetaInfo, GetSet) { TEST(MetaInfo, GetSetFeature) { xgboost::MetaInfo info; + ASSERT_THAT([&] { info.SetFeatureInfo("", nullptr, 0); }, + GMockThrow("Unknown feature info name")); EXPECT_THROW(info.SetFeatureInfo("", nullptr, 0), dmlc::Error); EXPECT_THROW(info.SetFeatureInfo("foo", nullptr, 0), dmlc::Error); EXPECT_NO_THROW(info.SetFeatureInfo("feature_name", nullptr, 0)); @@ -86,7 +89,8 @@ void VerifyGetSetFeatureColumnSplit() { std::transform(types.cbegin(), types.cend(), c_types.begin(), [](auto const &str) { return str.c_str(); }); info.num_col_ = kCols; - EXPECT_THROW(info.SetFeatureInfo(u8"feature_type", c_types.data(), c_types.size()), dmlc::Error); + ASSERT_THAT([&] { info.SetFeatureInfo(u8"feature_type", c_types.data(), c_types.size()); }, + GMockThrow("Length of feature_type must be equal to number of columns")); info.num_col_ = kCols * world_size; EXPECT_NO_THROW(info.SetFeatureInfo(u8"feature_type", c_types.data(), c_types.size())); std::vector expected_type_names{u8"float", u8"c", u8"float", @@ -103,7 +107,8 @@ void VerifyGetSetFeatureColumnSplit() { std::transform(names.cbegin(), names.cend(), c_names.begin(), [](auto const &str) { return str.c_str(); }); info.num_col_ = kCols; - EXPECT_THROW(info.SetFeatureInfo(u8"feature_name", c_names.data(), c_names.size()), dmlc::Error); + ASSERT_THAT([&] { info.SetFeatureInfo(u8"feature_name", c_names.data(), c_names.size()); }, + GMockThrow("Length of feature_name must be equal to number of columns")); info.num_col_ = kCols * world_size; EXPECT_NO_THROW(info.SetFeatureInfo(u8"feature_name", c_names.data(), c_names.size())); std::vector expected_names{u8"0.feature0", u8"0.feature1", u8"1.feature0", 
@@ -128,9 +133,9 @@ TEST(MetaInfo, SaveLoadBinary) { }; std::vector values (kRows); std::generate(values.begin(), values.end(), generator); - info.SetInfo(ctx, "label", values.data(), xgboost::DataType::kFloat32, kRows); - info.SetInfo(ctx, "weight", values.data(), xgboost::DataType::kFloat32, kRows); - info.SetInfo(ctx, "base_margin", values.data(), xgboost::DataType::kFloat32, kRows); + info.SetInfo(ctx, "label", Make1dInterfaceTest(values.data(), kRows)); + info.SetInfo(ctx, "weight", Make1dInterfaceTest(values.data(), kRows)); + info.SetInfo(ctx, "base_margin", Make1dInterfaceTest(values.data(), kRows)); info.num_row_ = kRows; info.num_col_ = kCols; @@ -224,7 +229,7 @@ TEST(MetaInfo, LoadQid) { const std::vector expected_group_ptr{0, 4, 8, 12}; CHECK(info.group_ptr_ == expected_group_ptr); - const std::vector expected_offset{ + const std::vector expected_offset{ 0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60 }; const std::vector expected_data{ @@ -264,7 +269,7 @@ TEST(MetaInfo, CPUQid) { qid[i] = i; } - info.SetInfo(ctx, "qid", qid.data(), xgboost::DataType::kUInt32, info.num_row_); + info.SetInfo(ctx, "qid", Make1dInterfaceTest(qid.data(), info.num_row_)); ASSERT_EQ(info.group_ptr_.size(), info.num_row_ + 1); ASSERT_EQ(info.group_ptr_.front(), 0); ASSERT_EQ(info.group_ptr_.back(), info.num_row_); @@ -281,14 +286,12 @@ TEST(MetaInfo, Validate) { info.num_col_ = 3; std::vector groups (11); Context ctx; - info.SetInfo(ctx, "group", groups.data(), xgboost::DataType::kUInt32, 11); + info.SetInfo(ctx, "group", Make1dInterfaceTest(groups.data(), groups.size())); EXPECT_THROW(info.Validate(FstCU()), dmlc::Error); std::vector labels(info.num_row_ + 1); EXPECT_THROW( - { - info.SetInfo(ctx, "label", labels.data(), xgboost::DataType::kFloat32, info.num_row_ + 1); - }, + { info.SetInfo(ctx, "label", Make1dInterfaceTest(labels.data(), info.num_row_ + 1)); }, dmlc::Error); // Make overflow data, which can happen when users pass group structure as int @@ -298,13 +301,13 @@ TEST(MetaInfo, Validate) { groups.push_back(1562500); } groups.push_back(static_cast(-1)); - EXPECT_THROW(info.SetInfo(ctx, "group", groups.data(), xgboost::DataType::kUInt32, groups.size()), + EXPECT_THROW(info.SetInfo(ctx, "group", Make1dInterfaceTest(groups.data(), groups.size())), dmlc::Error); #if defined(XGBOOST_USE_CUDA) || defined(XGBOOST_USE_HIP) info.group_ptr_.clear(); labels.resize(info.num_row_); - info.SetInfo(ctx, "label", labels.data(), xgboost::DataType::kFloat32, info.num_row_); + info.SetInfo(ctx, "label", Make1dInterfaceTest(labels.data(), info.num_row_)); info.labels.SetDevice(FstCU()); EXPECT_THROW(info.Validate(DeviceOrd::CUDA(1)), dmlc::Error); @@ -333,8 +336,8 @@ TEST(MetaInfo, HostExtend) { for (size_t g = 0; g < kRows / per_group; ++g) { groups.emplace_back(per_group); } - lhs.SetInfo(ctx, "group", groups.data(), xgboost::DataType::kUInt32, groups.size()); - rhs.SetInfo(ctx, "group", groups.data(), xgboost::DataType::kUInt32, groups.size()); + lhs.SetInfo(ctx, "group", Make1dInterfaceTest(groups.data(), groups.size())); + rhs.SetInfo(ctx, "group", Make1dInterfaceTest(groups.data(), groups.size())); lhs.Extend(rhs, true, true); ASSERT_EQ(lhs.num_row_, kRows * 2); diff --git a/tests/cpp/data/test_simple_dmatrix.cc b/tests/cpp/data/test_simple_dmatrix.cc index fa4165796..6334d96c6 100644 --- a/tests/cpp/data/test_simple_dmatrix.cc +++ b/tests/cpp/data/test_simple_dmatrix.cc @@ -223,7 +223,7 @@ TEST(SimpleDMatrix, FromFile) { auto batch = page.GetView(); EXPECT_EQ(batch.Size(), kExpectedNumRow); 
EXPECT_EQ(page.offset.HostVector(), - std::vector({0, 3, 6, 9, 12, 15, 15})); + std::vector({0, 3, 6, 9, 12, 15, 15})); EXPECT_EQ(page.base_rowid, 0); for (auto i = 0ull; i < batch.Size() - 1; i++) { diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc index f18f3133d..8de5b6498 100644 --- a/tests/cpp/gbm/test_gbtree.cc +++ b/tests/cpp/gbm/test_gbtree.cc @@ -171,7 +171,7 @@ TEST(GBTree, ChoosePredictor) { } TEST(GBTree, ChooseTreeMethod) { - bst_row_t n_samples{128}; + bst_idx_t n_samples{128}; bst_feature_t n_features{64}; auto Xy = RandomDataGenerator{n_samples, n_features, 0.5f}.GenerateDMatrix(true); @@ -408,7 +408,7 @@ class Dart : public testing::TestWithParam { for (size_t i = 0; i < kRows; ++i) { labels[i] = i % 2; } - p_mat->SetInfo("label", labels.data(), DataType::kFloat32, kRows); + p_mat->SetInfo("label", Make1dInterfaceTest(labels.data(), kRows)); auto learner = std::unique_ptr(Learner::Create({p_mat})); learner->SetParam("booster", "dart"); diff --git a/tests/cpp/gbm/test_gbtree.cu b/tests/cpp/gbm/test_gbtree.cu index f308e3b3e..227e07ffd 100644 --- a/tests/cpp/gbm/test_gbtree.cu +++ b/tests/cpp/gbm/test_gbtree.cu @@ -18,7 +18,7 @@ namespace xgboost { void TestInplaceFallback(Context const* ctx) { // prepare data - bst_row_t n_samples{1024}; + bst_idx_t n_samples{1024}; bst_feature_t n_features{32}; HostDeviceVector X_storage; // use a different device than the learner diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 1c7d68c31..f47eef14f 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -216,7 +216,7 @@ SimpleLCG::StateType SimpleLCG::Max() const { return max(); } static_assert(SimpleLCG::max() - SimpleLCG::min()); void RandomDataGenerator::GenerateLabels(std::shared_ptr p_fmat) const { - RandomDataGenerator{static_cast(p_fmat->Info().num_row_), this->n_targets_, 0.0f}.GenerateDense( + RandomDataGenerator{static_cast(p_fmat->Info().num_row_), this->n_targets_, 0.0f}.GenerateDense( p_fmat->Info().labels.Data()); CHECK_EQ(p_fmat->Info().labels.Size(), this->rows_ * this->n_targets_); p_fmat->Info().labels.Reshape(this->rows_, this->n_targets_); @@ -334,7 +334,7 @@ std::string RandomDataGenerator::GenerateColumnarArrayInterface( } void RandomDataGenerator::GenerateCSR( - HostDeviceVector* value, HostDeviceVector* row_ptr, + HostDeviceVector* value, HostDeviceVector* row_ptr, HostDeviceVector* columns) const { auto& h_value = value->HostVector(); auto& h_rptr = row_ptr->HostVector(); @@ -381,7 +381,7 @@ void RandomDataGenerator::GenerateCSR( [[nodiscard]] std::shared_ptr RandomDataGenerator::GenerateDMatrix( bool with_label, bool float_label, size_t classes, DataSplitMode data_split_mode) const { HostDeviceVector data; - HostDeviceVector rptrs; + HostDeviceVector rptrs; HostDeviceVector columns; this->GenerateCSR(&data, &rptrs, &columns); data::CSRAdapter adapter(rptrs.HostPointer(), columns.HostPointer(), data.HostPointer(), rows_, @@ -447,7 +447,7 @@ void RandomDataGenerator::GenerateCSR( // Loop over the batches and count the number of pages std::size_t batch_count = 0; - bst_row_t row_count = 0; + bst_idx_t row_count = 0; for (const auto& batch : dmat->GetBatches()) { batch_count++; row_count += batch.Size(); @@ -458,7 +458,7 @@ void RandomDataGenerator::GenerateCSR( EXPECT_EQ(row_count, dmat->Info().num_row_); if (with_label) { - RandomDataGenerator{static_cast(dmat->Info().num_row_), this->n_targets_, 0.0f}.GenerateDense( + RandomDataGenerator{static_cast(dmat->Info().num_row_), this->n_targets_, 0.0f}.GenerateDense( 
dmat->Info().labels.Data()); CHECK_EQ(dmat->Info().labels.Size(), this->rows_ * this->n_targets_); dmat->Info().labels.Reshape(this->rows_, this->n_targets_); @@ -488,7 +488,7 @@ int CudaArrayIterForTest::Next() { } #endif // !defined(XGBOOST_USE_CUDA) -NumpyArrayIterForTest::NumpyArrayIterForTest(float sparsity, size_t rows, size_t cols, +NumpyArrayIterForTest::NumpyArrayIterForTest(float sparsity, bst_idx_t rows, size_t cols, size_t batches) : ArrayIterForTest{sparsity, rows, cols, batches} { rng_->Device(DeviceOrd::CPU()); @@ -515,7 +515,7 @@ std::shared_ptr GetDMatrixFromData(const std::vector& x, std::si return p_fmat; } -std::unique_ptr CreateSparsePageDMatrix(bst_row_t n_samples, bst_feature_t n_features, +std::unique_ptr CreateSparsePageDMatrix(bst_idx_t n_samples, bst_feature_t n_features, size_t n_batches, std::string prefix) { CHECK_GE(n_samples, n_batches); NumpyArrayIterForTest iter(0, n_samples, n_features, n_batches); @@ -662,7 +662,7 @@ std::unique_ptr CreateTrainedGBM(std::string name, Args kwargs, return gbm; } -ArrayIterForTest::ArrayIterForTest(float sparsity, size_t rows, size_t cols, size_t batches) +ArrayIterForTest::ArrayIterForTest(float sparsity, bst_idx_t rows, size_t cols, size_t batches) : rows_{rows}, cols_{cols}, n_batches_{batches} { XGProxyDMatrixCreate(&proxy_); rng_ = std::make_unique(rows_, cols_, sparsity); diff --git a/tests/cpp/helpers.cu b/tests/cpp/helpers.cu index db94da27a..f75628953 100644 --- a/tests/cpp/helpers.cu +++ b/tests/cpp/helpers.cu @@ -1,8 +1,11 @@ +/** + * Copyright 2020-2024, XGBoost contributors + */ #include -#include "helpers.h" #include "../../src/data/device_adapter.cuh" #include "../../src/data/iterative_dmatrix.h" +#include "helpers.h" namespace xgboost { diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index 744793ba0..ff62f69dc 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -1,8 +1,9 @@ /** - * Copyright 2016-2024 by XGBoost contributors + * Copyright 2016-2024, XGBoost contributors */ #pragma once +#include #include #include #include @@ -12,21 +13,20 @@ #include // for LearnerModelParam #include // for Configurable -#include // std::int32_t +#include // std::int32_t #include -#include -#include #include #include -#include #include #include "../../src/collective/communicator-inl.h" #include "../../src/common/common.h" #include "../../src/common/threading_utils.h" -#include "../../src/data/array_interface.h" #include "filesystem.h" // dmlc::TemporaryDirectory #include "xgboost/linalg.h" +#if !defined(_OPENMP) +#include +#endif #if defined(__CUDACC__) || defined(__HIPCC__) #define DeclareUnifiedTest(name) GPU ## name @@ -222,7 +222,7 @@ Json GetArrayInterface(HostDeviceVector const* storage, size_t rows, size_t c // Generate in-memory random data without using DMatrix. 
class RandomDataGenerator { - bst_row_t rows_; + bst_idx_t rows_; size_t cols_; float sparsity_; @@ -245,7 +245,7 @@ class RandomDataGenerator { void GenerateLabels(std::shared_ptr p_fmat) const; public: - RandomDataGenerator(bst_row_t rows, size_t cols, float sparsity) + RandomDataGenerator(bst_idx_t rows, size_t cols, float sparsity) : rows_{rows}, cols_{cols}, sparsity_{sparsity}, lcg_{seed_} {} RandomDataGenerator& Lower(float v) { @@ -307,7 +307,7 @@ class RandomDataGenerator { std::string GenerateColumnarArrayInterface(std::vector>* data) const; - void GenerateCSR(HostDeviceVector* value, HostDeviceVector* row_ptr, + void GenerateCSR(HostDeviceVector* value, HostDeviceVector* row_ptr, HostDeviceVector* columns) const; [[nodiscard]] std::shared_ptr GenerateDMatrix( @@ -332,7 +332,7 @@ inline std::vector GenerateRandomCategoricalSingleColumn(int n, size_t nu std::vector x(n); std::mt19937 rng(0); std::uniform_int_distribution dist(0, num_categories - 1); - std::generate(x.begin(), x.end(), [&]() { return dist(rng); }); + std::generate(x.begin(), x.end(), [&]() { return static_cast(dist(rng)); }); // Make sure each category is present for (size_t i = 0; i < num_categories; i++) { x[i] = static_cast(i); @@ -353,7 +353,7 @@ std::shared_ptr GetDMatrixFromData(const std::vector& x, std::si * * \return A Sparse DMatrix with n_batches. */ -std::unique_ptr CreateSparsePageDMatrix(bst_row_t n_samples, bst_feature_t n_features, +std::unique_ptr CreateSparsePageDMatrix(bst_idx_t n_samples, bst_feature_t n_features, size_t n_batches, std::string prefix = "cache"); /** @@ -412,12 +412,12 @@ inline HostDeviceVector GenerateRandomGradients(const size_t n_row return gpair; } -inline linalg::Matrix GenerateRandomGradients(Context const* ctx, bst_row_t n_rows, +inline linalg::Matrix GenerateRandomGradients(Context const* ctx, bst_idx_t n_rows, bst_target_t n_targets, float lower = 0.0f, float upper = 1.0f) { auto g = GenerateRandomGradients(n_rows * n_targets, lower, upper); - linalg::Matrix gpair({n_rows, static_cast(n_targets)}, ctx->Device()); + linalg::Matrix gpair({n_rows, static_cast(n_targets)}, ctx->Device()); gpair.Data()->Copy(g); return gpair; } @@ -433,12 +433,12 @@ class ArrayIterForTest { std::vector batches_; std::string interface_; - size_t rows_; + bst_idx_t rows_; size_t cols_; size_t n_batches_; public: - size_t static constexpr Rows() { return 1024; } + bst_idx_t static constexpr Rows() { return 1024; } size_t static constexpr Batches() { return 100; } size_t static constexpr Cols() { return 13; } @@ -450,7 +450,7 @@ class ArrayIterForTest { [[nodiscard]] std::size_t Iter() const { return iter_; } auto Proxy() -> decltype(proxy_) { return proxy_; } - explicit ArrayIterForTest(float sparsity, size_t rows, size_t cols, size_t batches); + explicit ArrayIterForTest(float sparsity, bst_idx_t rows, size_t cols, size_t batches); /** * \brief Create iterator with user provided data. 
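A related sketch for the external-memory path these iterator classes support: CreateSparsePageDMatrix drives one of the test iterators internally and caches pages on disk under the given prefix. This assumes the helper keeps its documented defaults; SketchExternalMemory is a hypothetical name used only for illustration.

#include <cstddef>       // std::size_t

#include "filesystem.h"  // dmlc::TemporaryDirectory
#include "helpers.h"     // CreateSparsePageDMatrix

namespace xgboost {
// SketchExternalMemory is hypothetical; it mirrors the page-counting loop in helpers.cc above.
void SketchExternalMemory() {
  bst_idx_t n_samples{1024};
  bst_feature_t n_features{16};
  std::size_t n_batches{4};

  dmlc::TemporaryDirectory tmpdir;
  auto p_fmat = CreateSparsePageDMatrix(n_samples, n_features, n_batches, tmpdir.path + "/cache");

  // Pages are streamed back one SparsePage at a time.
  bst_idx_t row_count = 0;
  for (auto const& page : p_fmat->GetBatches<SparsePage>()) {
    row_count += page.Size();
  }
  CHECK_EQ(row_count, n_samples);
}
}  // namespace xgboost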
*/ @@ -469,7 +469,7 @@ class CudaArrayIterForTest : public ArrayIterForTest { class NumpyArrayIterForTest : public ArrayIterForTest { public: - explicit NumpyArrayIterForTest(float sparsity, size_t rows = Rows(), size_t cols = Cols(), + explicit NumpyArrayIterForTest(float sparsity, bst_idx_t rows = Rows(), size_t cols = Cols(), size_t batches = Batches()); explicit NumpyArrayIterForTest(Context const* ctx, HostDeviceVector const& data, std::size_t n_samples, bst_feature_t n_features, @@ -493,6 +493,16 @@ inline int Next(DataIterHandle self) { return static_cast(self)->Next(); } +/** + * @brief Create an array interface for host vector. + */ +template +char const* Make1dInterfaceTest(T const* vec, std::size_t len) { + static thread_local std::string str; + str = linalg::Make1dInterface(vec, len); + return str.c_str(); +} + class RMMAllocator; using RMMAllocatorPtr = std::unique_ptr; RMMAllocatorPtr SetUpRMMResourceForCppTests(int argc, char** argv); @@ -573,30 +583,7 @@ class DeclareUnifiedDistributedTest(MetricTest) : public BaseMGPUTest{}; inline DeviceOrd FstCU() { return DeviceOrd::CUDA(0); } -/** - * @brief poor man's gmock for message matching. - * - * @tparam Error The type of expected execption. - * - * @param submsg A substring of the actual error message. - * @param fn The function that throws Error - */ -template -void ExpectThrow(std::string submsg, Fn&& fn) { - try { - fn(); - } catch (Error const& exc) { - auto actual = std::string{exc.what()}; - ASSERT_NE(actual.find(submsg), std::string::npos) - << "Expecting substring `" << submsg << "` from the error message." - << " Got:\n" - << actual << "\n"; - return; - } catch (std::exception const& exc) { - auto actual = exc.what(); - ASSERT_TRUE(false) << "An unexpected type of exception is thrown. what:" << actual; - return; - } - ASSERT_TRUE(false) << "No exception is thrown"; +inline auto GMockThrow(StringView msg) { + return ::testing::ThrowsMessage(::testing::HasSubstr(msg)); } } // namespace xgboost diff --git a/tests/cpp/histogram_helpers.h b/tests/cpp/histogram_helpers.h index e5d603b42..d1e44fb88 100644 --- a/tests/cpp/histogram_helpers.h +++ b/tests/cpp/histogram_helpers.h @@ -47,7 +47,7 @@ inline std::unique_ptr BuildEllpackPage(int n_rows, int n_cols, 0.26f, 0.71f, 1.83f}); cmat.SetMins({0.1f, 0.2f, 0.3f, 0.1f, 0.2f, 0.3f, 0.2f, 0.2f}); - bst_row_t row_stride = 0; + bst_idx_t row_stride = 0; const auto &offset_vec = batch.offset.ConstHostVector(); for (size_t i = 1; i < offset_vec.size(); ++i) { row_stride = std::max(row_stride, offset_vec[i] - offset_vec[i-1]); diff --git a/tests/cpp/metric/test_elementwise_metric.h b/tests/cpp/metric/test_elementwise_metric.h index ef34d7651..4435c0807 100644 --- a/tests/cpp/metric/test_elementwise_metric.h +++ b/tests/cpp/metric/test_elementwise_metric.h @@ -5,10 +5,9 @@ #include #include -#include #include +#include // for iota -#include "../../../src/common/linalg_op.h" #include "../helpers.h" namespace xgboost::metric { diff --git a/tests/cpp/objective/test_regression_obj_cpu.cc b/tests/cpp/objective/test_regression_obj_cpu.cc index 4e9c0e3c0..d72b330f1 100644 --- a/tests/cpp/objective/test_regression_obj_cpu.cc +++ b/tests/cpp/objective/test_regression_obj_cpu.cc @@ -1,14 +1,15 @@ -/*! 
- * Copyright 2018-2023 XGBoost contributors +/** + * Copyright 2018-2024, XGBoost contributors */ #include #include #include -#include "../../../src/objective/adaptive.h" -#include "../../../src/tree/param.h" // for TrainParam -#include "../helpers.h" +#include // for iota +#include "../../../src/objective/adaptive.h" +#include "../../../src/tree/param.h" // for TrainParam +#include "../helpers.h" #include "test_regression_obj.h" namespace xgboost { diff --git a/tests/cpp/plugin/federated/test_federated_coll.cc b/tests/cpp/plugin/federated/test_federated_coll.cc index ad053f286..6b7000ef9 100644 --- a/tests/cpp/plugin/federated/test_federated_coll.cc +++ b/tests/cpp/plugin/federated/test_federated_coll.cc @@ -60,8 +60,7 @@ TEST_F(FederatedCollTest, Allgather) { std::vector buffer(n_workers, 0); buffer[comm->Rank()] = comm->Rank(); - auto rc = coll.Allgather(*comm, common::EraseType(common::Span{buffer.data(), buffer.size()}), - sizeof(int)); + auto rc = coll.Allgather(*comm, common::EraseType(common::Span{buffer.data(), buffer.size()})); ASSERT_TRUE(rc.OK()); for (auto i = 0; i < n_workers; i++) { ASSERT_EQ(buffer[i], i); diff --git a/tests/cpp/plugin/federated/test_federated_coll.cu b/tests/cpp/plugin/federated/test_federated_coll.cu index a6ec7e352..237bdeb9d 100644 --- a/tests/cpp/plugin/federated/test_federated_coll.cu +++ b/tests/cpp/plugin/federated/test_federated_coll.cu @@ -5,13 +5,13 @@ #include #include // for Result +#include "../../../../src/collective/allreduce.h" #include "../../../../src/common/common.h" // for AllVisibleGPUs #include "../../../../src/common/device_helpers.cuh" // for device_vector #include "../../../../src/common/type.h" // for EraseType #include "../../collective/test_worker.h" // for SocketTest #include "../../helpers.h" // for MakeCUDACtx #include "federated_coll.cuh" -#include "federated_comm.cuh" #include "test_worker.h" // for TestFederated namespace xgboost::collective { @@ -71,7 +71,7 @@ void TestAllgather(std::shared_ptr comm, std::int32_t rank, std:: dh::device_vector buffer(n_workers, 0); buffer[comm->Rank()] = comm->Rank(); - auto rc = w.coll->Allgather(*w.nccl_comm, common::EraseType(dh::ToSpan(buffer)), sizeof(int)); + auto rc = w.coll->Allgather(*w.nccl_comm, common::EraseType(dh::ToSpan(buffer))); ASSERT_TRUE(rc.OK()); for (auto i = 0; i < n_workers; i++) { ASSERT_EQ(buffer[i], i); diff --git a/tests/cpp/plugin/federated/test_federated_coll.hip b/tests/cpp/plugin/federated/test_federated_coll.hip new file mode 100644 index 000000000..af572c6a2 --- /dev/null +++ b/tests/cpp/plugin/federated/test_federated_coll.hip @@ -0,0 +1,4 @@ + +#ifdef XGBOOST_USE_HIP +#include "test_federated_coll.cu" +#endif diff --git a/tests/cpp/plugin/federated/test_federated_comm.cc b/tests/cpp/plugin/federated/test_federated_comm.cc index 0d0692b5f..16edc685f 100644 --- a/tests/cpp/plugin/federated/test_federated_comm.cc +++ b/tests/cpp/plugin/federated/test_federated_comm.cc @@ -1,5 +1,5 @@ /** - * Copyright 2022-2023, XGBoost contributors + * Copyright 2022-2024, XGBoost contributors */ #include #include @@ -9,7 +9,7 @@ #include "../../../../plugin/federated/federated_comm.h" #include "../../collective/test_worker.h" // for SocketTest -#include "../../helpers.h" // for ExpectThrow +#include "../../helpers.h" // for GMockThrow #include "test_worker.h" // for TestFederated #include "xgboost/json.h" // for Json @@ -20,19 +20,19 @@ class FederatedCommTest : public SocketTest {}; TEST_F(FederatedCommTest, ThrowOnWorldSizeTooSmall) { auto construct = [] { 
FederatedComm comm{"localhost", 0, 0, 0}; }; - ASSERT_THAT(construct, - ::testing::ThrowsMessage(::testing::HasSubstr("Invalid world size"))); + ASSERT_THAT(construct, GMockThrow("Invalid world size")); } TEST_F(FederatedCommTest, ThrowOnRankTooSmall) { auto construct = [] { FederatedComm comm{"localhost", 0, 1, -1}; }; - ASSERT_THAT(construct, - ::testing::ThrowsMessage(::testing::HasSubstr("Invalid worker rank."))); + ASSERT_THAT(construct, GMockThrow("Invalid worker rank.")); } TEST_F(FederatedCommTest, ThrowOnRankTooBig) { - auto construct = [] { FederatedComm comm{"localhost", 0, 1, 1}; }; - ExpectThrow("Invalid worker rank.", construct); + auto construct = [] { + FederatedComm comm{"localhost", 0, 1, 1}; + }; + ASSERT_THAT(construct, GMockThrow("Invalid worker rank.")); } TEST_F(FederatedCommTest, ThrowOnWorldSizeNotInteger) { @@ -43,7 +43,7 @@ TEST_F(FederatedCommTest, ThrowOnWorldSizeNotInteger) { config["federated_rank"] = Integer(0); FederatedComm comm{DefaultRetry(), std::chrono::seconds{DefaultTimeoutSec()}, "", config}; }; - ExpectThrow("got: `String`", construct); + ASSERT_THAT(construct, GMockThrow("got: `String`")); } TEST_F(FederatedCommTest, ThrowOnRankNotInteger) { @@ -54,7 +54,7 @@ TEST_F(FederatedCommTest, ThrowOnRankNotInteger) { config["federated_rank"] = std::string("0"); FederatedComm comm(DefaultRetry(), std::chrono::seconds{DefaultTimeoutSec()}, "", config); }; - ExpectThrow("got: `String`", construct); + ASSERT_THAT(construct, GMockThrow("got: `String`")); } TEST_F(FederatedCommTest, GetWorldSizeAndRank) { diff --git a/tests/cpp/plugin/federated/test_federated_comm_group.hip b/tests/cpp/plugin/federated/test_federated_comm_group.hip new file mode 100644 index 000000000..077a4210d --- /dev/null +++ b/tests/cpp/plugin/federated/test_federated_comm_group.hip @@ -0,0 +1,4 @@ + +#ifdef XGBOOST_USE_HIP +#include "test_federated_comm_group.cu" +#endif diff --git a/tests/cpp/plugin/federated/test_federated_tracker.cc b/tests/cpp/plugin/federated/test_federated_tracker.cc index 81ff95540..aa979ff15 100644 --- a/tests/cpp/plugin/federated/test_federated_tracker.cc +++ b/tests/cpp/plugin/federated/test_federated_tracker.cc @@ -1,5 +1,5 @@ /** - * Copyright 2023, XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include @@ -8,7 +8,6 @@ #include "../../../../src/collective/tracker.h" // for GetHostAddress #include "federated_tracker.h" -#include "test_worker.h" #include "xgboost/json.h" // for Json namespace xgboost::collective { @@ -26,7 +25,7 @@ TEST(FederatedTrackerTest, Basic) { ASSERT_GE(tracker->Port(), 1); std::string host; auto rc = GetHostAddress(&host); - ASSERT_EQ(get(args["DMLC_TRACKER_URI"]), host); + ASSERT_EQ(get(args["dmlc_tracker_uri"]), host); rc = tracker->Shutdown(); ASSERT_TRUE(rc.OK()); diff --git a/tests/cpp/plugin/sycl_helpers.h b/tests/cpp/plugin/sycl_helpers.h new file mode 100644 index 000000000..afc403d86 --- /dev/null +++ b/tests/cpp/plugin/sycl_helpers.h @@ -0,0 +1,31 @@ +/*! 
+ * Copyright 2022-2024 XGBoost contributors + */ +#pragma once + +#include "../helpers.h" + +namespace xgboost::sycl { +template +void VerifySyclVector(const USMVector& sycl_vector, + const Container& host_vector, T eps = T()) { + ASSERT_EQ(sycl_vector.Size(), host_vector.size()); + + size_t size = sycl_vector.Size(); + for (size_t i = 0; i < size; ++i) { + EXPECT_NEAR(sycl_vector[i], host_vector[i], eps); + } +} + +template +void VerifySyclVector(const std::vector& sycl_vector, + const Container& host_vector, T eps = T()) { + ASSERT_EQ(sycl_vector.size(), host_vector.size()); + + size_t size = sycl_vector.size(); + for (size_t i = 0; i < size; ++i) { + EXPECT_NEAR(sycl_vector[i], host_vector[i], eps); + } +} + +} // namespace xgboost::sycl diff --git a/tests/cpp/plugin/test_federated_adapter.cu b/tests/cpp/plugin/test_federated_adapter.cu index cec180e70..b96524878 100644 --- a/tests/cpp/plugin/test_federated_adapter.cu +++ b/tests/cpp/plugin/test_federated_adapter.cu @@ -26,7 +26,6 @@ TEST(FederatedAdapterSimpleTest, ThrowOnInvalidDeviceOrdinal) { namespace { void VerifyAllReduceSum() { auto const world_size = collective::GetWorldSize(); - auto const rank = collective::GetRank(); auto const device = GPUIDX; int count = 3; common::SetDevice(device); diff --git a/tests/cpp/plugin/test_sycl_ghist_builder.cc b/tests/cpp/plugin/test_sycl_ghist_builder.cc new file mode 100644 index 000000000..dacbc75fc --- /dev/null +++ b/tests/cpp/plugin/test_sycl_ghist_builder.cc @@ -0,0 +1,157 @@ +/** + * Copyright 2020-2024 by XGBoost contributors + */ +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wtautological-constant-compare" +#pragma GCC diagnostic ignored "-W#pragma-messages" +#include "../../../src/data/gradient_index.h" // for GHistIndexMatrix +#pragma GCC diagnostic pop + +#include "../../../plugin/sycl/common/hist_util.h" +#include "../../../plugin/sycl/device_manager.h" +#include "sycl_helpers.h" +#include "../helpers.h" + +namespace xgboost::sycl::common { + +template +void GHistBuilderTest(float sparsity, bool force_atomic_use) { + const size_t num_rows = 8; + const size_t num_columns = 1; + const int n_bins = 2; + const GradientSumT eps = 1e-6; + + Context ctx; + ctx.UpdateAllowUnknown(Args{{"device", "sycl"}}); + + DeviceManager device_manager; + auto qu = device_manager.GetQueue(ctx.Device()); + + auto p_fmat = RandomDataGenerator{num_rows, num_columns, sparsity}.GenerateDMatrix(); + sycl::DeviceMatrix dmat; + dmat.Init(qu, p_fmat.get()); + + GHistIndexMatrix gmat_sycl; + gmat_sycl.Init(qu, &ctx, dmat, n_bins); + + xgboost::GHistIndexMatrix gmat{&ctx, p_fmat.get(), n_bins, 0.3, false}; + + RowSetCollection row_set_collection; + auto& row_indices = row_set_collection.Data(); + row_indices.Resize(&qu, num_rows); + size_t* p_row_indices = row_indices.Data(); + + qu.submit([&](::sycl::handler& cgh) { + cgh.parallel_for<>(::sycl::range<1>(num_rows), + [p_row_indices](::sycl::item<1> pid) { + const size_t idx = pid.get_id(0); + p_row_indices[idx] = idx; + }); + }).wait_and_throw(); + row_set_collection.Init(); + + auto builder = GHistBuilder(qu, n_bins); + + std::vector gpair = { + {0.1f, 0.2f}, {0.3f, 0.4f}, {0.5f, 0.6f}, {0.7f, 0.8f}, + {0.9f, 0.1f}, {0.2f, 0.3f}, {0.4f, 0.5f}, {0.6f, 0.7f}}; + CHECK_EQ(gpair.size(), num_rows); + USMVector gpair_device(&qu, gpair); + + std::vector hist_host(2*n_bins); + GHistRow hist(&qu, 2 * n_bins); + ::sycl::event event; + + const size_t nblocks = 2; + GHistRow hist_buffer(&qu, 2 * nblocks * n_bins); + + InitHist(qu, &hist, 
hist.Size(), &event); + InitHist(qu, &hist_buffer, hist_buffer.Size(), &event); + + event = builder.BuildHist(gpair_device, row_set_collection[0], gmat_sycl, &hist, + sparsity < eps , &hist_buffer, event, force_atomic_use); + qu.memcpy(hist_host.data(), hist.Data(), + 2 * n_bins * sizeof(GradientSumT), event); + qu.wait_and_throw(); + + // Build hist on host to compare + std::vector hist_desired(2*n_bins); + for (size_t rid = 0; rid < num_rows; ++rid) { + const size_t ibegin = gmat.row_ptr[rid]; + const size_t iend = gmat.row_ptr[rid + 1]; + for (size_t i = ibegin; i < iend; ++i) { + const size_t bin_idx = gmat.index[i]; + hist_desired[2*bin_idx] += gpair[rid].GetGrad(); + hist_desired[2*bin_idx+1] += gpair[rid].GetHess(); + } + } + + VerifySyclVector(hist_host, hist_desired, eps); +} + +template +void GHistSubtractionTest() { + const size_t n_bins = 4; + using GHistType = GHistRow; + + Context ctx; + ctx.UpdateAllowUnknown(Args{{"device", "sycl"}}); + + DeviceManager device_manager; + auto qu = device_manager.GetQueue(ctx.Device()); + + ::sycl::event event; + std::vector hist1_host = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}; + GHistType hist1(&qu, 2 * n_bins); + event = qu.memcpy(hist1.Data(), hist1_host.data(), + 2 * n_bins * sizeof(GradientSumT), event); + + std::vector hist2_host = {0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1}; + GHistType hist2(&qu, 2 * n_bins); + event = qu.memcpy(hist2.Data(), hist2_host.data(), + 2 * n_bins * sizeof(GradientSumT), event); + + std::vector hist3_host(2 * n_bins); + GHistType hist3(&qu, 2 * n_bins); + event = SubtractionHist(qu, &hist3, hist1, hist2, n_bins, event); + qu.memcpy(hist3_host.data(), hist3.Data(), + 2 * n_bins * sizeof(GradientSumT), event); + qu.wait_and_throw(); + + std::vector hist3_desired(2 * n_bins); + for (size_t idx = 0; idx < 2 * n_bins; ++idx) { + hist3_desired[idx] = hist1_host[idx] - hist2_host[idx]; + } + + const GradientSumT eps = 1e-6; + VerifySyclVector(hist3_host, hist3_desired, eps); +} + +TEST(SyclGHistBuilder, ByBlockDenseCase) { + GHistBuilderTest(0.0, false); + GHistBuilderTest(0.0, false); +} + +TEST(SyclGHistBuilder, ByBlockSparseCase) { + GHistBuilderTest(0.3, false); + GHistBuilderTest(0.3, false); +} + +TEST(SyclGHistBuilder, ByAtomicDenseCase) { + GHistBuilderTest(0.0, true); + GHistBuilderTest(0.0, true); +} + +TEST(SyclGHistBuilder, ByAtomicSparseCase) { + GHistBuilderTest(0.3, true); + GHistBuilderTest(0.3, true); +} + +TEST(SyclGHistBuilder, Subtraction) { + GHistSubtractionTest(); + GHistSubtractionTest(); +} + +} // namespace xgboost::sycl::common diff --git a/tests/cpp/plugin/test_sycl_gradient_index.cc b/tests/cpp/plugin/test_sycl_gradient_index.cc new file mode 100644 index 000000000..4d605ce7a --- /dev/null +++ b/tests/cpp/plugin/test_sycl_gradient_index.cc @@ -0,0 +1,80 @@ +/** + * Copyright 2021-2024 by XGBoost contributors + */ + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wtautological-constant-compare" +#pragma GCC diagnostic ignored "-W#pragma-messages" +#include "../../../src/data/gradient_index.h" // for GHistIndexMatrix +#pragma GCC diagnostic pop + +#include "../../../plugin/sycl/data/gradient_index.h" +#include "../../../plugin/sycl/device_manager.h" +#include "sycl_helpers.h" +#include "../helpers.h" + +namespace xgboost::sycl::data { + +TEST(SyclGradientIndex, HistogramCuts) { + size_t max_bins = 8; + + Context ctx; + ctx.UpdateAllowUnknown(Args{{"device", "sycl"}}); + + DeviceManager device_manager; + auto qu = device_manager.GetQueue(ctx.Device()); + + auto p_fmat = 
RandomDataGenerator{512, 16, 0.5}.GenerateDMatrix(true); + + xgboost::common::HistogramCuts cut = + xgboost::common::SketchOnDMatrix(&ctx, p_fmat.get(), max_bins); + + common::HistogramCuts cut_sycl; + cut_sycl.Init(qu, cut); + + VerifySyclVector(cut_sycl.Ptrs(), cut.cut_ptrs_.HostVector()); + VerifySyclVector(cut_sycl.Values(), cut.cut_values_.HostVector()); + VerifySyclVector(cut_sycl.MinValues(), cut.min_vals_.HostVector()); +} + +TEST(SyclGradientIndex, Init) { + size_t n_rows = 128; + size_t n_columns = 7; + + Context ctx; + ctx.UpdateAllowUnknown(Args{{"device", "sycl"}}); + + DeviceManager device_manager; + auto qu = device_manager.GetQueue(ctx.Device()); + + auto p_fmat = RandomDataGenerator{n_rows, n_columns, 0.3}.GenerateDMatrix(); + + sycl::DeviceMatrix dmat; + dmat.Init(qu, p_fmat.get()); + + int max_bins = 256; + common::GHistIndexMatrix gmat_sycl; + gmat_sycl.Init(qu, &ctx, dmat, max_bins); + + xgboost::GHistIndexMatrix gmat{&ctx, p_fmat.get(), max_bins, 0.3, false}; + + { + ASSERT_EQ(gmat_sycl.max_num_bins, max_bins); + ASSERT_EQ(gmat_sycl.nfeatures, n_columns); + } + + { + VerifySyclVector(gmat_sycl.hit_count, gmat.hit_count); + } + + { + std::vector feature_count_sycl(n_columns, 0); + gmat_sycl.GetFeatureCounts(feature_count_sycl.data()); + + std::vector feature_count(n_columns, 0); + gmat.GetFeatureCounts(feature_count.data()); + VerifySyclVector(feature_count_sycl, feature_count); + } +} + +} // namespace xgboost::sycl::data diff --git a/tests/cpp/plugin/test_sycl_partition_builder.cc b/tests/cpp/plugin/test_sycl_partition_builder.cc index 90bc757eb..7e3126a79 100644 --- a/tests/cpp/plugin/test_sycl_partition_builder.cc +++ b/tests/cpp/plugin/test_sycl_partition_builder.cc @@ -13,6 +13,108 @@ namespace xgboost::sycl::common { +void TestPartitioning(float sparsity, int max_bins) { + const size_t num_rows = 16; + const size_t num_columns = 1; + + Context ctx; + ctx.UpdateAllowUnknown(Args{{"device", "sycl"}}); + + DeviceManager device_manager; + auto qu = device_manager.GetQueue(ctx.Device()); + + auto p_fmat = RandomDataGenerator{num_rows, num_columns, sparsity}.GenerateDMatrix(); + sycl::DeviceMatrix dmat; + dmat.Init(qu, p_fmat.get()); + + common::GHistIndexMatrix gmat; + gmat.Init(qu, &ctx, dmat, max_bins); + + RowSetCollection row_set_collection; + auto& row_indices = row_set_collection.Data(); + row_indices.Resize(&qu, num_rows); + size_t* p_row_indices = row_indices.Data(); + + qu.submit([&](::sycl::handler& cgh) { + cgh.parallel_for<>(::sycl::range<1>(num_rows), + [p_row_indices](::sycl::item<1> pid) { + const size_t idx = pid.get_id(0); + p_row_indices[idx] = idx; + }); + }).wait_and_throw(); + row_set_collection.Init(); + + RegTree tree; + tree.ExpandNode(0, 0, 0, false, 0, 0, 0, 0, 0, 0, 0); + + const size_t n_nodes = row_set_collection.Size(); + PartitionBuilder partition_builder; + partition_builder.Init(&qu, n_nodes, [&](size_t nid) { + return row_set_collection[nid].Size(); + }); + + std::vector nodes; + nodes.emplace_back(tree::ExpandEntry(0, tree.GetDepth(0))); + + ::sycl::event event; + std::vector split_conditions = {2}; + partition_builder.Partition(gmat, nodes, row_set_collection, + split_conditions, &tree, &event); + qu.wait_and_throw(); + + size_t* data_result = const_cast(row_set_collection[0].begin); + partition_builder.MergeToArray(0, data_result, &event); + qu.wait_and_throw(); + + bst_float split_pt = gmat.cut.Values()[split_conditions[0]]; + + std::vector ridx_left(num_rows, 0); + std::vector ridx_right(num_rows, 0); + for (auto &batch : 
gmat.p_fmat->GetBatches()) { + const auto& data_vec = batch.data.HostVector(); + const auto& offset_vec = batch.offset.HostVector(); + + size_t begin = offset_vec[0]; + for (size_t idx = 0; idx < offset_vec.size() - 1; ++idx) { + size_t end = offset_vec[idx + 1]; + if (begin < end) { + const auto& entry = data_vec[begin]; + if (entry.fvalue < split_pt) { + ridx_left[idx] = 1; + } else { + ridx_right[idx] = 1; + } + } else { + // missing value + if (tree[0].DefaultLeft()) { + ridx_left[idx] = 1; + } else { + ridx_right[idx] = 1; + } + } + begin = end; + } + } + auto n_left = std::accumulate(ridx_left.begin(), ridx_left.end(), 0); + auto n_right = std::accumulate(ridx_right.begin(), ridx_right.end(), 0); + + std::vector row_indices_host(num_rows); + qu.memcpy(row_indices_host.data(), row_indices.Data(), num_rows * sizeof(size_t)); + qu.wait_and_throw(); + + ASSERT_EQ(n_left, partition_builder.GetNLeftElems(0)); + for (size_t i = 0; i < n_left; ++i) { + auto idx = row_indices_host[i]; + ASSERT_EQ(ridx_left[idx], 1); + } + + ASSERT_EQ(n_right, partition_builder.GetNRightElems(0)); + for (size_t i = 0; i < n_right; ++i) { + auto idx = row_indices_host[num_rows - 1 - i]; + ASSERT_EQ(ridx_right[idx], 1); + } +} + TEST(SyclPartitionBuilder, BasicTest) { constexpr size_t kNodes = 5; // Number of rows for each node @@ -67,7 +169,7 @@ TEST(SyclPartitionBuilder, BasicTest) { std::vector v(*std::max_element(rows.begin(), rows.end())); size_t row_id = 0; for(size_t nid = 0; nid < kNodes; ++nid) { - builder.MergeToArray(nid, v.data(), event); + builder.MergeToArray(nid, v.data(), &event); qu.wait(); // Check that row_id for left side are correct @@ -88,4 +190,20 @@ TEST(SyclPartitionBuilder, BasicTest) { } } +TEST(SyclPartitionBuilder, PartitioningSparce) { + TestPartitioning(0.3, 256); +} + +TEST(SyclPartitionBuilder, PartitioningDence8Bits) { + TestPartitioning(0.0, 256); +} + +TEST(SyclPartitionBuilder, PartitioningDence16Bits) { + TestPartitioning(0.0, 256 + 1); +} + +TEST(SyclPartitionBuilder, PartitioningDence32Bits) { + TestPartitioning(0.0, (1u << 16) + 1); +} + } // namespace xgboost::common diff --git a/tests/cpp/plugin/test_sycl_predictor.cc b/tests/cpp/plugin/test_sycl_predictor.cc index d5b3a5e5c..7bd788a3b 100755 --- a/tests/cpp/plugin/test_sycl_predictor.cc +++ b/tests/cpp/plugin/test_sycl_predictor.cc @@ -43,7 +43,7 @@ TEST(SyclPredictor, ExternalMemory) { } TEST(SyclPredictor, InplacePredict) { - bst_row_t constexpr kRows{128}; + bst_idx_t constexpr kRows{128}; bst_feature_t constexpr kCols{64}; Context ctx; auto gen = RandomDataGenerator{kRows, kCols, 0.5}.Device(ctx.Device()); @@ -106,4 +106,4 @@ TEST(SyclPredictor, Multi) { TestVectorLeafPrediction(&ctx); } -} // namespace xgboost \ No newline at end of file +} // namespace xgboost diff --git a/tests/cpp/plugin/test_sycl_quantile_hist_builder.cc b/tests/cpp/plugin/test_sycl_quantile_hist_builder.cc new file mode 100644 index 000000000..4bf7bd962 --- /dev/null +++ b/tests/cpp/plugin/test_sycl_quantile_hist_builder.cc @@ -0,0 +1,55 @@ +/** + * Copyright 2020-2024 by XGBoost contributors + */ +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wtautological-constant-compare" +#pragma GCC diagnostic ignored "-W#pragma-messages" +#include +#include +#include "../../../plugin/sycl/tree/updater_quantile_hist.h" // for QuantileHistMaker +#pragma GCC diagnostic pop + +namespace xgboost::sycl::tree { +TEST(SyclQuantileHistMaker, Basic) { + Context ctx; + ctx.UpdateAllowUnknown(Args{{"device", "sycl"}}); + + ObjInfo 
task{ObjInfo::kRegression}; + std::unique_ptr updater{TreeUpdater::Create("grow_quantile_histmaker_sycl", &ctx, &task)}; + + ASSERT_EQ(updater->Name(), "grow_quantile_histmaker_sycl"); +} + +TEST(SyclQuantileHistMaker, JsonIO) { + Context ctx; + ctx.UpdateAllowUnknown(Args{{"device", "sycl"}}); + + ObjInfo task{ObjInfo::kRegression}; + Json config {Object()}; + { + std::unique_ptr updater{TreeUpdater::Create("grow_quantile_histmaker_sycl", &ctx, &task)}; + updater->Configure({{"max_depth", std::to_string(42)}}); + updater->Configure({{"single_precision_histogram", std::to_string(true)}}); + updater->SaveConfig(&config); + } + + { + std::unique_ptr updater{TreeUpdater::Create("grow_quantile_histmaker_sycl", &ctx, &task)}; + updater->LoadConfig(config); + + Json new_config {Object()}; + updater->SaveConfig(&new_config); + + ASSERT_EQ(config, new_config); + + auto max_depth = atoi(get(new_config["train_param"]["max_depth"]).c_str()); + ASSERT_EQ(max_depth, 42); + + auto single_precision_histogram = atoi(get(new_config["sycl_hist_train_param"]["single_precision_histogram"]).c_str()); + ASSERT_EQ(single_precision_histogram, 1); + } + +} +} // namespace xgboost::sycl::tree diff --git a/tests/cpp/plugin/test_sycl_row_set_collection.cc b/tests/cpp/plugin/test_sycl_row_set_collection.cc new file mode 100644 index 000000000..f527d9f16 --- /dev/null +++ b/tests/cpp/plugin/test_sycl_row_set_collection.cc @@ -0,0 +1,78 @@ +/** + * Copyright 2020-2023 by XGBoost contributors + */ +#include + +#include +#include +#include + +#include "../../../plugin/sycl/common/row_set.h" +#include "../../../plugin/sycl/device_manager.h" +#include "../helpers.h" + +namespace xgboost::sycl::common { +TEST(SyclRowSetCollection, AddSplits) { + const size_t num_rows = 16; + + DeviceManager device_manager; + auto qu = device_manager.GetQueue(DeviceOrd::SyclDefault()); + + RowSetCollection row_set_collection; + + auto& row_indices = row_set_collection.Data(); + row_indices.Resize(&qu, num_rows); + size_t* p_row_indices = row_indices.Data(); + + qu.submit([&](::sycl::handler& cgh) { + cgh.parallel_for<>(::sycl::range<1>(num_rows), + [p_row_indices](::sycl::item<1> pid) { + const size_t idx = pid.get_id(0); + p_row_indices[idx] = idx; + }); + }).wait_and_throw(); + row_set_collection.Init(); + + CHECK_EQ(row_set_collection.Size(), 1); + { + size_t nid_test = 0; + auto& elem = row_set_collection[nid_test]; + CHECK_EQ(elem.begin, row_indices.Begin()); + CHECK_EQ(elem.end, row_indices.End()); + CHECK_EQ(elem.node_id , 0); + } + + size_t nid = 0; + size_t nid_left = 1; + size_t nid_right = 2; + size_t n_left = 4; + size_t n_right = num_rows - n_left; + row_set_collection.AddSplit(nid, nid_left, nid_right, n_left, n_right); + CHECK_EQ(row_set_collection.Size(), 3); + + { + size_t nid_test = 0; + auto& elem = row_set_collection[nid_test]; + CHECK_EQ(elem.begin, nullptr); + CHECK_EQ(elem.end, nullptr); + CHECK_EQ(elem.node_id , -1); + } + + { + size_t nid_test = 1; + auto& elem = row_set_collection[nid_test]; + CHECK_EQ(elem.begin, row_indices.Begin()); + CHECK_EQ(elem.end, row_indices.Begin() + n_left); + CHECK_EQ(elem.node_id , nid_test); + } + + { + size_t nid_test = 2; + auto& elem = row_set_collection[nid_test]; + CHECK_EQ(elem.begin, row_indices.Begin() + n_left); + CHECK_EQ(elem.end, row_indices.End()); + CHECK_EQ(elem.node_id , nid_test); + } + +} +} // namespace xgboost::sycl::common diff --git a/tests/cpp/plugin/test_sycl_split_evaluator.cc b/tests/cpp/plugin/test_sycl_split_evaluator.cc new file mode 100644 index 
000000000..507490fd1 --- /dev/null +++ b/tests/cpp/plugin/test_sycl_split_evaluator.cc @@ -0,0 +1,134 @@ +/** + * Copyright 2020-2024 by XGBoost contributors + */ +#include +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wtautological-constant-compare" +#pragma GCC diagnostic ignored "-W#pragma-messages" +#include "../../../plugin/sycl/tree/split_evaluator.h" +#pragma GCC diagnostic pop + +#include "../../../plugin/sycl/device_manager.h" +#include "../helpers.h" + +namespace xgboost::sycl::tree { + +template +void BasicTestSplitEvaluator(const std::string& monotone_constraints, bool has_constrains) { + const size_t n_columns = 2; + + xgboost::tree::TrainParam param; + param.UpdateAllowUnknown(Args{{"min_child_weight", "0"}, + {"reg_lambda", "0"}, + {"monotone_constraints", monotone_constraints}}); + + DeviceManager device_manager; + auto qu = device_manager.GetQueue(DeviceOrd::SyclDefault()); + + TreeEvaluator tree_evaluator(qu, param, n_columns); + { + // Check correctness of has_constrains flag + ASSERT_EQ(tree_evaluator.HasConstraint(), has_constrains); + } + + auto split_evaluator = tree_evaluator.GetEvaluator(); + { + // Check if params were inititialised correctly + ASSERT_EQ(split_evaluator.param.min_child_weight, param.min_child_weight); + ASSERT_EQ(split_evaluator.param.reg_lambda, param.reg_lambda); + ASSERT_EQ(split_evaluator.param.reg_alpha, param.reg_alpha); + ASSERT_EQ(split_evaluator.param.max_delta_step, param.max_delta_step); + } +} + +template +void TestSplitEvaluator(const std::string& monotone_constraints) { + const size_t n_columns = 2; + + xgboost::tree::TrainParam param; + param.UpdateAllowUnknown(Args{{"min_child_weight", "0"}, + {"reg_lambda", "0"}, + {"monotone_constraints", monotone_constraints}}); + + DeviceManager device_manager; + auto qu = device_manager.GetQueue(DeviceOrd::SyclDefault()); + + TreeEvaluator tree_evaluator(qu, param, n_columns); + auto split_evaluator = tree_evaluator.GetEvaluator(); + { + // Test ThresholdL1 + const GradientSumT alpha = 0.5; + { + const GradientSumT val = 0.0; + const auto trh = split_evaluator.ThresholdL1(val, alpha); + ASSERT_EQ(trh, 0.0); + } + + { + const GradientSumT val = 1.0; + const auto trh = split_evaluator.ThresholdL1(val, alpha); + ASSERT_EQ(trh, val - alpha); + } + + { + const GradientSumT val = -1.0; + const auto trh = split_evaluator.ThresholdL1(val, alpha); + ASSERT_EQ(trh, val + alpha); + } + } + + { + constexpr float eps = 1e-8; + tree_evaluator.AddSplit(0, 1, 2, 0, 0.3, 0.7); + + GradStats left(0.1, 0.2); + GradStats right(0.3, 0.4); + bst_node_t nidx = 0; + bst_feature_t fidx = 0; + + GradientSumT wleft = split_evaluator.CalcWeight(nidx, left); + // wleft = -grad/hess = -0.1/0.2 + EXPECT_NEAR(wleft, -0.5, eps); + GradientSumT wright = split_evaluator.CalcWeight(nidx, right); + // wright = -grad/hess = -0.3/0.4 + EXPECT_NEAR(wright, -0.75, eps); + + GradientSumT gweight_left = split_evaluator.CalcGainGivenWeight(nidx, left, wleft); + // gweight_left = left.grad**2 / left.hess = 0.1*0.1/0.2 = 0.05 + EXPECT_NEAR(gweight_left, 0.05, eps); + // gweight_left = right.grad**2 / right.hess = 0.3*0.3/0.4 = 0.225 + GradientSumT gweight_right = split_evaluator.CalcGainGivenWeight(nidx, right, wright); + EXPECT_NEAR(gweight_right, 0.225, eps); + + GradientSumT split_gain = split_evaluator.CalcSplitGain(nidx, fidx, left, right); + if (!tree_evaluator.HasConstraint()) { + EXPECT_NEAR(split_gain, gweight_left + gweight_right, eps); + } else { + // Parameters are chosen to have -inf here + 
ASSERT_EQ(split_gain, -std::numeric_limits::infinity()); + } + } +} + +TEST(SyclSplitEvaluator, BasicTest) { + BasicTestSplitEvaluator("( 0, 0)", false); + BasicTestSplitEvaluator("( 1, 0)", true); + BasicTestSplitEvaluator("( 0, 1)", true); + BasicTestSplitEvaluator("(-1, 0)", true); + BasicTestSplitEvaluator("( 0, -1)", true); + BasicTestSplitEvaluator("( 1, 1)", true); + BasicTestSplitEvaluator("(-1, -1)", true); + BasicTestSplitEvaluator("( 1, -1)", true); + BasicTestSplitEvaluator("(-1, 1)", true); +} + +TEST(SyclSplitEvaluator, TestMath) { + // Without constraints + TestSplitEvaluator("( 0, 0)"); + // With constraints + TestSplitEvaluator("( 1, 0)"); +} + +} // namespace xgboost::sycl::tree diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc index 8f3955c05..46b085916 100644 --- a/tests/cpp/predictor/test_cpu_predictor.cc +++ b/tests/cpp/predictor/test_cpu_predictor.cc @@ -65,7 +65,7 @@ TEST(CpuPredictor, ExternalMemory) { } TEST(CpuPredictor, InplacePredict) { - bst_row_t constexpr kRows{128}; + bst_idx_t constexpr kRows{128}; bst_feature_t constexpr kCols{64}; Context ctx; auto gen = RandomDataGenerator{kRows, kCols, 0.5}.Device(ctx.Device()); @@ -83,7 +83,7 @@ TEST(CpuPredictor, InplacePredict) { { HostDeviceVector data; - HostDeviceVector rptrs; + HostDeviceVector rptrs; HostDeviceVector columns; gen.GenerateCSR(&data, &rptrs, &columns); auto data_interface = GetArrayInterface(&data, kRows * kCols, 1); @@ -148,7 +148,7 @@ TEST(CPUPredictor, GHistIndexTraining) { auto adapter = data::ArrayAdapter(columnar.c_str()); std::shared_ptr p_full{ DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), 1)}; - TestTrainingPrediction(&ctx, kRows, kBins, p_full, p_hist); + TestTrainingPrediction(&ctx, kRows, kBins, p_full, p_hist, true); } TEST(CPUPredictor, CategoricalPrediction) { diff --git a/tests/cpp/predictor/test_predictor.cc b/tests/cpp/predictor/test_predictor.cc index 3420de78e..ceb1ef1b1 100644 --- a/tests/cpp/predictor/test_predictor.cc +++ b/tests/cpp/predictor/test_predictor.cc @@ -118,7 +118,8 @@ TEST(Predictor, PredictionCache) { } void TestTrainingPrediction(Context const *ctx, size_t rows, size_t bins, - std::shared_ptr p_full, std::shared_ptr p_hist) { + std::shared_ptr p_full, std::shared_ptr p_hist, + bool check_contribs) { size_t constexpr kCols = 16; size_t constexpr kClasses = 3; size_t constexpr kIters = 3; @@ -161,9 +162,31 @@ void TestTrainingPrediction(Context const *ctx, size_t rows, size_t bins, for (size_t i = 0; i < rows; ++i) { EXPECT_NEAR(from_hist.ConstHostVector()[i], from_full.ConstHostVector()[i], kRtEps); } + + if (check_contribs) { + // Contributions + HostDeviceVector from_full_contribs; + learner->Predict(p_full, false, &from_full_contribs, 0, 0, false, false, true); + HostDeviceVector from_hist_contribs; + learner->Predict(p_hist, false, &from_hist_contribs, 0, 0, false, false, true); + for (size_t i = 0; i < from_full_contribs.ConstHostVector().size(); ++i) { + EXPECT_NEAR(from_hist_contribs.ConstHostVector()[i], + from_full_contribs.ConstHostVector()[i], kRtEps); + } + + // Contributions (approximate method) + HostDeviceVector from_full_approx_contribs; + learner->Predict(p_full, false, &from_full_approx_contribs, 0, 0, false, false, false, true); + HostDeviceVector from_hist_approx_contribs; + learner->Predict(p_hist, false, &from_hist_approx_contribs, 0, 0, false, false, false, true); + for (size_t i = 0; i < from_full_approx_contribs.ConstHostVector().size(); ++i) { + 
EXPECT_NEAR(from_hist_approx_contribs.ConstHostVector()[i], + from_full_approx_contribs.ConstHostVector()[i], kRtEps); + } + } } -void TestInplacePrediction(Context const *ctx, std::shared_ptr x, bst_row_t rows, +void TestInplacePrediction(Context const *ctx, std::shared_ptr x, bst_idx_t rows, bst_feature_t cols) { std::size_t constexpr kClasses { 4 }; auto gen = RandomDataGenerator{rows, cols, 0.5}.Device(ctx->Device()); @@ -232,7 +255,7 @@ std::unique_ptr LearnerForTest(Context const *ctx, std::shared_ptr m_test, std::shared_ptr m_invalid) { HostDeviceVector prediction; diff --git a/tests/cpp/predictor/test_predictor.h b/tests/cpp/predictor/test_predictor.h index c2b28883a..1ccd35102 100644 --- a/tests/cpp/predictor/test_predictor.h +++ b/tests/cpp/predictor/test_predictor.h @@ -89,9 +89,10 @@ void TestBasic(DMatrix* dmat, Context const * ctx); // p_full and p_hist should come from the same data set. void TestTrainingPrediction(Context const* ctx, size_t rows, size_t bins, - std::shared_ptr p_full, std::shared_ptr p_hist); + std::shared_ptr p_full, std::shared_ptr p_hist, + bool check_contribs = false); -void TestInplacePrediction(Context const* ctx, std::shared_ptr x, bst_row_t rows, +void TestInplacePrediction(Context const* ctx, std::shared_ptr x, bst_idx_t rows, bst_feature_t cols); void TestPredictionWithLesserFeatures(Context const* ctx); diff --git a/tests/cpp/test_helpers.cc b/tests/cpp/test_helpers.cc index 79d8d2475..f582ba564 100644 --- a/tests/cpp/test_helpers.cc +++ b/tests/cpp/test_helpers.cc @@ -11,7 +11,7 @@ TEST(RandomDataGenerator, DMatrix) { auto p_dmatrix = RandomDataGenerator{kRows, kCols, kSparsity}.GenerateDMatrix(); HostDeviceVector csr_value; - HostDeviceVector csr_rptr; + HostDeviceVector csr_rptr; HostDeviceVector csr_cidx; RandomDataGenerator{kRows, kCols, kSparsity}.GenerateCSR(&csr_value, &csr_rptr, &csr_cidx); diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc index 8dccea10b..a2cf1a289 100644 --- a/tests/cpp/test_learner.cc +++ b/tests/cpp/test_learner.cc @@ -1,5 +1,5 @@ /** - * Copyright (c) 2017-2023, XGBoost contributors + * Copyright 2017-2024, XGBoost contributors */ #include #include @@ -12,7 +12,6 @@ #include // for int32_t, int64_t, uint32_t #include // for size_t #include // for ofstream -#include // for back_insert_iterator, back_inserter #include // for numeric_limits #include // for map #include // for unique_ptr, shared_ptr, __shared_ptr_... 
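Context for the SetInfo changes in the hunks below: metadata is now passed as a 1-d array-interface JSON string produced by the Make1dInterfaceTest helper added to tests/cpp/helpers.h earlier in this patch, instead of a raw pointer plus DataType plus length. A minimal sketch assuming that helper; SketchSetLabels is a hypothetical name.

#include <cstddef>  // std::size_t
#include <memory>   // std::shared_ptr
#include <vector>   // std::vector

#include "helpers.h"  // Make1dInterfaceTest

namespace xgboost {
// SketchSetLabels is hypothetical, shown only to contrast the old and new SetInfo calls.
void SketchSetLabels(std::shared_ptr<DMatrix> p_fmat, std::size_t n_rows) {
  std::vector<float> labels(n_rows);
  for (std::size_t i = 0; i < n_rows; ++i) {
    labels[i] = static_cast<float>(i % 2);  // alternating labels, as in the tests in this file
  }
  // Old call: p_fmat->SetInfo("label", labels.data(), DataType::kFloat32, n_rows);
  // New call: the buffer is described by an array-interface string instead of a raw pointer.
  p_fmat->SetInfo("label", Make1dInterfaceTest(labels.data(), n_rows));
}
}  // namespace xgboost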
@@ -30,7 +29,6 @@ #include "../../src/common/random.h" // for GlobalRandom #include "dmlc/io.h" // for Stream #include "dmlc/omp.h" // for omp_get_max_threads -#include "dmlc/registry.h" // for Registry #include "filesystem.h" // for TemporaryDirectory #include "helpers.h" // for GetBaseScore, RandomDataGenerator #include "objective_helpers.h" // for MakeObjNamesForTest, ObjTestNameGenerator @@ -82,9 +80,7 @@ TEST(Learner, ParameterValidation) { // whitespace learner->SetParam("tree method", "exact"); - EXPECT_THAT([&] { learner->Configure(); }, - ::testing::ThrowsMessage( - ::testing::HasSubstr(R"("tree method" contains whitespace)"))); + ASSERT_THAT([&] { learner->Configure(); }, GMockThrow(R"("tree method" contains whitespace)")); } TEST(Learner, CheckGroup) { @@ -105,9 +101,9 @@ TEST(Learner, CheckGroup) { labels[i] = i % 2; } - p_mat->SetInfo("weight", static_cast(weight.data()), DataType::kFloat32, kNumGroups); - p_mat->SetInfo("group", group.data(), DataType::kUInt32, kNumGroups); - p_mat->SetInfo("label", labels.data(), DataType::kFloat32, kNumRows); + p_mat->SetInfo("weight", Make1dInterfaceTest(weight.data(), kNumGroups)); + p_mat->SetInfo("group", Make1dInterfaceTest(group.data(), kNumGroups)); + p_mat->SetInfo("label", Make1dInterfaceTest(labels.data(), kNumRows)); std::vector> mat = {p_mat}; auto learner = std::unique_ptr(Learner::Create(mat)); @@ -117,7 +113,7 @@ TEST(Learner, CheckGroup) { group.resize(kNumGroups+1); group[3] = 4; group[4] = 1; - p_mat->SetInfo("group", group.data(), DataType::kUInt32, kNumGroups+1); + p_mat->SetInfo("group", Make1dInterfaceTest(group.data(), kNumGroups+1)); EXPECT_ANY_THROW(learner->UpdateOneIter(0, p_mat)); } @@ -134,7 +130,7 @@ TEST(Learner, SLOW_CheckMultiBatch) { // NOLINT for (size_t i = 0; i < num_row; ++i) { labels[i] = i % 2; } - dmat->SetInfo("label", labels.data(), DataType::kFloat32, num_row); + dmat->SetInfo("label", Make1dInterfaceTest(labels.data(), num_row)); std::vector> mat{dmat}; auto learner = std::unique_ptr(Learner::Create(mat)); learner->SetParams(Args{{"objective", "binary:logistic"}}); @@ -219,7 +215,7 @@ TEST(Learner, JsonModelIO) { } TEST(Learner, ConfigIO) { - bst_row_t n_samples = 128; + bst_idx_t n_samples = 128; bst_feature_t n_features = 12; std::shared_ptr p_fmat{ RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true, false, 2)}; diff --git a/tests/cpp/test_serialization.cc b/tests/cpp/test_serialization.cc index bf23991c1..eacfcc58f 100644 --- a/tests/cpp/test_serialization.cc +++ b/tests/cpp/test_serialization.cc @@ -60,7 +60,7 @@ void CompareJSON(Json l, Json r) { } break; } - case Value::ValueKind::kNumberArray: { + case Value::ValueKind::kF32Array: { auto const& l_arr = get(l); auto const& r_arr = get(r); ASSERT_EQ(l_arr.size(), r_arr.size()); @@ -69,6 +69,15 @@ void CompareJSON(Json l, Json r) { } break; } + case Value::ValueKind::kF64Array: { + auto const& l_arr = get(l); + auto const& r_arr = get(r); + ASSERT_EQ(l_arr.size(), r_arr.size()); + for (size_t i = 0; i < l_arr.size(); ++i) { + ASSERT_NEAR(l_arr[i], r_arr[i], kRtEps); + } + break; + } case Value::ValueKind::kU8Array: { CompareIntArray(l, r); break; diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index 862bc6bfc..f4accfc8a 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -363,7 +363,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) { GPUTrainingParam param{tparam}; thrust::device_vector feature_set = 
std::vector{0}; - thrust::device_vector feature_segments = std::vector{0, 2}; + thrust::device_vector feature_segments = std::vector{0, 2}; thrust::device_vector feature_values = std::vector{1.0, 2.0}; thrust::device_vector feature_min_values = std::vector{0.0}; auto feature_histogram = ConvertToInteger(&ctx, {{-0.5, 0.5}, {0.5, 0.5}}); @@ -412,7 +412,7 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) { GPUTrainingParam param{tparam}; thrust::device_vector feature_set = std::vector{1}; - thrust::device_vector feature_segments = std::vector{0, 2, 4}; + thrust::device_vector feature_segments = std::vector{0, 2, 4}; thrust::device_vector feature_values = std::vector{1.0, 2.0, 11.0, 12.0}; thrust::device_vector feature_min_values = std::vector{0.0, 10.0}; auto feature_histogram = @@ -446,7 +446,7 @@ TEST(GpuHist, EvaluateSingleSplitBreakTies) { GPUTrainingParam param{tparam}; thrust::device_vector feature_set = std::vector{0, 1}; - thrust::device_vector feature_segments = std::vector{0, 2, 4}; + thrust::device_vector feature_segments = std::vector{0, 2, 4}; thrust::device_vector feature_values = std::vector{1.0, 2.0, 11.0, 12.0}; thrust::device_vector feature_min_values = std::vector{0.0, 10.0}; auto feature_histogram = @@ -478,7 +478,7 @@ TEST(GpuHist, EvaluateSplits) { GPUTrainingParam param{tparam}; thrust::device_vector feature_set = std::vector{0, 1}; - thrust::device_vector feature_segments = std::vector{0, 2, 4}; + thrust::device_vector feature_segments = std::vector{0, 2, 4}; thrust::device_vector feature_values = std::vector{1.0, 2.0, 11.0, 12.0}; thrust::device_vector feature_min_values = std::vector{0.0, 0.0}; auto feature_histogram_left = diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index f7f2e27ea..84cd956db 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -239,4 +239,18 @@ void TestAtomicAdd() { TEST(Histogram, AtomicAddInt64) { TestAtomicAdd(); } + +TEST(Histogram, Quantiser) { + auto ctx = MakeCUDACtx(0); + std::size_t n_samples{16}; + HostDeviceVector gpair(n_samples, GradientPair{1.0, 1.0}); + gpair.SetDevice(ctx.Device()); + + auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo()); + for (auto v : gpair.ConstHostVector()) { + auto gh = quantiser.ToFloatingPoint(quantiser.ToFixedPoint(v)); + ASSERT_EQ(gh.GetGrad(), 1.0); + ASSERT_EQ(gh.GetHess(), 1.0); + } +} } // namespace xgboost::tree diff --git a/tests/cpp/tree/hist/test_histogram.cc b/tests/cpp/tree/hist/test_histogram.cc index 25a800367..5b48f2793 100644 --- a/tests/cpp/tree/hist/test_histogram.cc +++ b/tests/cpp/tree/hist/test_histogram.cc @@ -409,9 +409,9 @@ void TestHistogramExternalMemory(Context const *ctx, BatchParam batch_param, boo batch_param.hess = hess; } - std::vector partition_size(1, 0); + std::vector partition_size(1, 0); bst_bin_t total_bins{0}; - bst_row_t n_samples{0}; + bst_idx_t n_samples{0}; auto gpair = GenerateRandomGradients(m->Info().num_row_, 0.0, 1.0); auto const &h_gpair = gpair.HostVector(); diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index de24cf827..8774eeccb 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -441,7 +441,7 @@ RegTree GetHistTree(Context const* ctx, DMatrix* dmat) { return tree; } -void VerifyHistColumnSplit(bst_row_t rows, bst_feature_t cols, RegTree const& expected_tree) { +void VerifyHistColumnSplit(bst_idx_t rows, bst_feature_t cols, RegTree const& expected_tree) { 
Context ctx(MakeCUDACtx(GPUIDX)); auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true); @@ -491,7 +491,7 @@ RegTree GetApproxTree(Context const* ctx, DMatrix* dmat) { return tree; } -void VerifyApproxColumnSplit(bst_row_t rows, bst_feature_t cols, RegTree const& expected_tree) { +void VerifyApproxColumnSplit(bst_idx_t rows, bst_feature_t cols, RegTree const& expected_tree) { Context ctx(MakeCUDACtx(GPUIDX)); auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true); diff --git a/tests/cpp/tree/test_multi_target_tree_model.cc b/tests/cpp/tree/test_multi_target_tree_model.cc index 550b8837c..0b5745a20 100644 --- a/tests/cpp/tree/test_multi_target_tree_model.cc +++ b/tests/cpp/tree/test_multi_target_tree_model.cc @@ -1,5 +1,5 @@ /** - * Copyright 2023 by XGBoost Contributors + * Copyright 2023-2024, XGBoost Contributors */ #include #include // for Context @@ -7,16 +7,23 @@ #include // for RegTree namespace xgboost { -TEST(MultiTargetTree, JsonIO) { +namespace { +auto MakeTreeForTest() { bst_target_t n_targets{3}; bst_feature_t n_features{4}; RegTree tree{n_targets, n_features}; - ASSERT_TRUE(tree.IsMultiTarget()); + CHECK(tree.IsMultiTarget()); linalg::Vector base_weight{{1.0f, 2.0f, 3.0f}, {3ul}, DeviceOrd::CPU()}; linalg::Vector left_weight{{2.0f, 3.0f, 4.0f}, {3ul}, DeviceOrd::CPU()}; linalg::Vector right_weight{{3.0f, 4.0f, 5.0f}, {3ul}, DeviceOrd::CPU()}; tree.ExpandNode(RegTree::kRoot, /*split_idx=*/1, 0.5f, true, base_weight.HostView(), left_weight.HostView(), right_weight.HostView()); + return tree; +} +} // namespace + +TEST(MultiTargetTree, JsonIO) { + auto tree = MakeTreeForTest(); ASSERT_EQ(tree.NumNodes(), 3); ASSERT_EQ(tree.NumTargets(), 3); ASSERT_EQ(tree.GetMultiTargetTree()->Size(), 3); @@ -44,4 +51,28 @@ TEST(MultiTargetTree, JsonIO) { loaded.SaveModel(&jtree1); check_jtree(jtree1, tree); } + +TEST(MultiTargetTree, DumpDot) { + auto tree = MakeTreeForTest(); + auto n_features = tree.NumFeatures(); + FeatureMap fmap; + for (bst_feature_t f = 0; f < n_features; ++f) { + auto name = "feat_" + std::to_string(f); + fmap.PushBack(f, name.c_str(), "q"); + } + auto str = tree.DumpModel(fmap, true, "dot"); + ASSERT_NE(str.find("leaf=[2, 3, 4]"), std::string::npos); + ASSERT_NE(str.find("leaf=[3, 4, 5]"), std::string::npos); + + { + bst_target_t n_targets{4}; + bst_feature_t n_features{4}; + RegTree tree{n_targets, n_features}; + linalg::Vector weight{{1.0f, 2.0f, 3.0f, 4.0f}, {4ul}, DeviceOrd::CPU()}; + tree.ExpandNode(RegTree::kRoot, /*split_idx=*/1, 0.5f, true, weight.HostView(), + weight.HostView(), weight.HostView()); + auto str = tree.DumpModel(fmap, true, "dot"); + ASSERT_NE(str.find("leaf=[1, 2, ..., 4]"), std::string::npos); + } +} } // namespace xgboost diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc index cf806536a..1c3651005 100644 --- a/tests/cpp/tree/test_quantile_hist.cc +++ b/tests/cpp/tree/test_quantile_hist.cc @@ -1,5 +1,5 @@ /** - * Copyright 2018-2023 by XGBoost Contributors + * Copyright 2018-2024, XGBoost Contributors */ #include #include @@ -18,7 +18,6 @@ #include "xgboost/data.h" namespace xgboost::tree { - namespace { template void TestPartitioner(bst_target_t n_targets) { @@ -202,7 +201,7 @@ TEST(QuantileHist, PartitionerColSplit) { TestColumnSplitPartitioner(3); } namespace { -void VerifyColumnSplit(Context const* ctx, bst_row_t rows, bst_feature_t cols, bst_target_t n_targets, +void VerifyColumnSplit(Context const* ctx, bst_idx_t rows, bst_feature_t cols, bst_target_t n_targets, RegTree const& 
expected_tree) { auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true); linalg::Matrix gpair = GenerateRandomGradients(ctx, rows, n_targets); @@ -253,5 +252,5 @@ void TestColumnSplit(bst_target_t n_targets) { TEST(QuantileHist, ColumnSplit) { TestColumnSplit(1); } -TEST(QuantileHist, DISABLED_ColumnSplitMultiTarget) { TestColumnSplit(3); } +TEST(QuantileHist, ColumnSplitMultiTarget) { TestColumnSplit(3); } } // namespace xgboost::tree diff --git a/tests/cpp/tree/test_refresh.cc b/tests/cpp/tree/test_refresh.cc index c8859c898..bbd274a08 100644 --- a/tests/cpp/tree/test_refresh.cc +++ b/tests/cpp/tree/test_refresh.cc @@ -15,7 +15,7 @@ namespace xgboost::tree { TEST(Updater, Refresh) { - bst_row_t constexpr kRows = 8; + bst_idx_t constexpr kRows = 8; bst_feature_t constexpr kCols = 16; Context ctx; diff --git a/tests/python-gpu/test_from_cudf.py b/tests/python-gpu/test_from_cudf.py index 8707af0c8..c3a0b7d5f 100644 --- a/tests/python-gpu/test_from_cudf.py +++ b/tests/python-gpu/test_from_cudf.py @@ -71,15 +71,6 @@ def _test_from_cudf(DMatrixT): assert dtrain.num_col() == 1 assert dtrain.num_row() == 5 - # Boolean is not supported. - X_boolean = cudf.DataFrame({"x": cudf.Series([True, False])}) - with pytest.raises(Exception): - dtrain = DMatrixT(X_boolean) - - y_boolean = cudf.DataFrame({"x": cudf.Series([True, False, True, True, True])}) - with pytest.raises(Exception): - dtrain = DMatrixT(X_boolean, label=y_boolean) - def _test_cudf_training(DMatrixT): import pandas as pd diff --git a/tests/python-gpu/test_from_cupy.py b/tests/python-gpu/test_from_cupy.py index 79814a1bb..85d54c78d 100644 --- a/tests/python-gpu/test_from_cupy.py +++ b/tests/python-gpu/test_from_cupy.py @@ -202,7 +202,10 @@ class TestFromCupy: n = 100 X = cp.random.random((n, 2)) m = xgb.QuantileDMatrix(X.toDlpack()) - with pytest.raises(xgb.core.XGBoostError): + + with pytest.raises( + xgb.core.XGBoostError, match="Slicing DMatrix is not supported" + ): m.slice(rindex=[0, 1, 2]) @pytest.mark.skipif(**tm.no_cupy()) diff --git a/tests/python-gpu/test_gpu_ranking.py b/tests/python-gpu/test_gpu_ranking.py index 2579b17de..b7c5c3adb 100644 --- a/tests/python-gpu/test_gpu_ranking.py +++ b/tests/python-gpu/test_gpu_ranking.py @@ -6,6 +6,7 @@ import pytest import xgboost from xgboost import testing as tm +from xgboost.testing.ranking import run_normalization pytestmark = tm.timeout(30) @@ -126,3 +127,7 @@ def test_with_mq2008(objective, metric) -> None: dtest = xgboost.DMatrix(x_test, y_test, qid=qid_test) comp_training_with_rank_objective(dtrain, dtest, objective, metric) + + +def test_normalization() -> None: + run_normalization("cuda") diff --git a/tests/python/test_ranking.py b/tests/python/test_ranking.py index 8bdeb070f..49508f594 100644 --- a/tests/python/test_ranking.py +++ b/tests/python/test_ranking.py @@ -13,6 +13,7 @@ import xgboost from xgboost import testing as tm from xgboost.testing.data import RelDataCV, simulate_clicks, sort_ltr_samples from xgboost.testing.params import lambdarank_parameter_strategy +from xgboost.testing.ranking import run_normalization def test_ndcg_custom_gain(): @@ -53,6 +54,20 @@ def test_ndcg_custom_gain(): assert byxgb.evals_result() == bynp.evals_result() assert byxgb_json == bynp_json + # test pairwise can handle max_rel > 31, while ndcg metric is using custom gain + X, y, q, w = tm.make_ltr(n_samples=1024, n_features=4, n_query_groups=3, max_rel=33) + ranknet = xgboost.XGBRanker( + tree_method="hist", + ndcg_exp_gain=False, + n_estimators=10, + objective="rank:pairwise", + 
+    )
+    ranknet.fit(X, y, qid=q, eval_set=[(X, y)], eval_qid=[q])
+    history = ranknet.evals_result()
+    assert (
+        history["validation_0"]["ndcg@32"][0] < history["validation_0"]["ndcg@32"][-1]
+    )
+
 
 def test_ranking_with_unweighted_data():
     Xrow = np.array([1, 2, 6, 8, 11, 14, 16, 17])
@@ -188,6 +203,10 @@ def test_unbiased() -> None:
     assert df["ti+"].iloc[-1] < df["ti+"].iloc[0]
 
 
+def test_normalization() -> None:
+    run_normalization("cpu")
+
+
 class TestRanking:
     @classmethod
     def setup_class(cls):
diff --git a/tests/python/test_shap.py b/tests/python/test_shap.py
index 88149c054..097298f07 100644
--- a/tests/python/test_shap.py
+++ b/tests/python/test_shap.py
@@ -2,7 +2,6 @@ import itertools
 import re
 
 import numpy as np
-import scipy
 import scipy.special
 
 import xgboost as xgb
@@ -256,3 +255,30 @@ class TestSHAP:
         brute_force[-1, -1] += base_score
         fast_method = bst.predict(xgb.DMatrix(X[0:1, :]), pred_interactions=True)
         assert np.linalg.norm(brute_force - fast_method[0, :, :]) < 1e-4
+
+    def test_shap_values(self) -> None:
+        from sklearn.datasets import make_classification, make_regression
+
+        def assert_same(X: np.ndarray, y: np.ndarray) -> None:
+            Xy = xgb.DMatrix(X, y)
+            booster = xgb.train({}, Xy, num_boost_round=4)
+            shap_dm = booster.predict(Xy, pred_contribs=True)
+            Xy = xgb.QuantileDMatrix(X, y)
+            shap_qdm = booster.predict(Xy, pred_contribs=True)
+            np.testing.assert_allclose(shap_dm, shap_qdm)
+
+            margin = booster.predict(Xy, output_margin=True)
+            np.testing.assert_allclose(
+                np.sum(shap_qdm, axis=len(shap_qdm.shape) - 1), margin, 1e-3, 1e-3
+            )
+
+            shap_dm = booster.predict(Xy, pred_interactions=True)
+            Xy = xgb.QuantileDMatrix(X, y)
+            shap_qdm = booster.predict(Xy, pred_interactions=True)
+            np.testing.assert_allclose(shap_dm, shap_qdm)
+
+        X, y = make_regression()
+        assert_same(X, y)
+
+        X, y = make_classification()
+        assert_same(X, y)
diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py
index e7641348d..8ec1fdd9d 100644
--- a/tests/python/test_updaters.py
+++ b/tests/python/test_updaters.py
@@ -35,10 +35,24 @@ class TestTreeMethod:
     def test_exact(self, param, num_rounds, dataset):
         if dataset.name.endswith("-l1"):
             return
-        param['tree_method'] = 'exact'
+        param["tree_method"] = "exact"
         param = dataset.set_params(param)
         result = train_result(param, dataset.get_dmat(), num_rounds)
-        assert tm.non_increasing(result['train'][dataset.metric])
+        assert tm.non_increasing(result["train"][dataset.metric])
+
+    def test_exact_sample_by_node_error(self) -> None:
+        X, y, w = tm.make_regression(128, 12, False)
+        with pytest.raises(ValueError, match="column sample by node"):
+            xgb.train(
+                {"tree_method": "exact", "colsample_bynode": 0.999},
+                xgb.DMatrix(X, y, weight=w),
+            )
+
+        xgb.train(
+            {"tree_method": "exact", "colsample_bynode": 1.0},
+            xgb.DMatrix(X, y, weight=w),
+            num_boost_round=2,
+        )
 
     @given(
         exact_parameter_strategy,
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index 344628e4f..507470724 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -517,6 +517,12 @@ def test_regression_with_custom_objective():
         labels = y[test_index]
         assert mean_squared_error(preds, labels) < 25
 
+    w = rng.uniform(low=0.0, high=1.0, size=X.shape[0])
+    reg = xgb.XGBRegressor(objective=tm.ls_obj, n_estimators=25)
+    reg.fit(X, y, sample_weight=w)
+    y_pred = reg.predict(X)
+    assert mean_squared_error(y_true=y, y_pred=y_pred, sample_weight=w) < 25
+
     # Test that the custom objective function is actually used
     class XGBCustomObjectiveException(Exception):
         pass
@@ -1456,3 +1462,16 @@ def test_intercept() -> None:
     result = reg.intercept_
     assert result.dtype == np.float32
     assert result[0] < 0.5
+
+
+def test_fit_none() -> None:
+    with pytest.raises(TypeError, match="NoneType"):
+        xgb.XGBClassifier().fit(None, [0, 1])
+
+    X = rng.normal(size=4).reshape(2, 2)
+
+    with pytest.raises(ValueError, match="Invalid classes"):
+        xgb.XGBClassifier().fit(X, None)
+
+    with pytest.raises(ValueError, match="labels"):
+        xgb.XGBRegressor().fit(X, None)
diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py
index d3b60ecdf..9ef1a5566 100644
--- a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py
+++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py
@@ -252,7 +252,7 @@ class TestDistributedGPU:
         X_onehot, _ = make_categorical(local_cuda_client, 10000, 30, 13, True)
         X_onehot = dask_cudf.from_dask_dataframe(X_onehot)
-        run_categorical(local_cuda_client, "gpu_hist", X, X_onehot, y)
+        run_categorical(local_cuda_client, "hist", "cuda", X, X_onehot, y)
 
     @given(
         params=hist_parameter_strategy,
diff --git a/tests/test_distributed/test_with_dask/test_with_dask.py b/tests/test_distributed/test_with_dask/test_with_dask.py
index fdf0d64c4..ca55716bb 100644
--- a/tests/test_distributed/test_with_dask/test_with_dask.py
+++ b/tests/test_distributed/test_with_dask/test_with_dask.py
@@ -315,8 +315,15 @@ def test_dask_sparse(client: "Client") -> None:
     )
 
 
-def run_categorical(client: "Client", tree_method: str, X, X_onehot, y) -> None:
-    parameters = {"tree_method": tree_method, "max_cat_to_onehot": 9999}  # force onehot
+def run_categorical(
+    client: "Client", tree_method: str, device: str, X, X_onehot, y
+) -> None:
+    # Force onehot
+    parameters = {
+        "tree_method": tree_method,
+        "device": device,
+        "max_cat_to_onehot": 9999,
+    }
     rounds = 10
     m = xgb.dask.DaskDMatrix(client, X_onehot, y, enable_categorical=True)
     by_etl_results = xgb.dask.train(
@@ -364,6 +371,7 @@ def run_categorical(client: "Client", tree_method: str, X, X_onehot, y) -> None:
         enable_categorical=True,
         n_estimators=10,
         tree_method=tree_method,
+        device=device,
         # force onehot
         max_cat_to_onehot=9999,
     )
@@ -378,7 +386,10 @@ def run_categorical(client: "Client", tree_method: str, X, X_onehot, y) -> None:
     reg.fit(X, y)
     # check partition based
     reg = xgb.dask.DaskXGBRegressor(
-        enable_categorical=True, n_estimators=10, tree_method=tree_method
+        enable_categorical=True,
+        n_estimators=10,
+        tree_method=tree_method,
+        device=device,
     )
     reg.fit(X, y, eval_set=[(X, y)])
     assert tm.non_increasing(reg.evals_result()["validation_0"]["rmse"])
@@ -398,8 +409,8 @@ def run_categorical(client: "Client", tree_method: str, X, X_onehot, y) -> None:
 def test_categorical(client: "Client") -> None:
     X, y = make_categorical(client, 10000, 30, 13)
     X_onehot, _ = make_categorical(client, 10000, 30, 13, True)
-    run_categorical(client, "approx", X, X_onehot, y)
-    run_categorical(client, "hist", X, X_onehot, y)
+    run_categorical(client, "approx", "cpu", X, X_onehot, y)
+    run_categorical(client, "hist", "cpu", X, X_onehot, y)
 
     ft = ["c"] * X.shape[1]
     reg = xgb.dask.DaskXGBRegressor(
@@ -1750,9 +1761,20 @@ class TestWithDask:
         )
         tm.non_increasing(results_native["validation_0"]["rmse"])
+        reg = xgb.dask.DaskXGBRegressor(
+            n_estimators=rounds, objective=tm.ls_obj, tree_method="hist"
+        )
+        rng = da.random.RandomState(1994)
+        w = rng.uniform(low=0.0, high=1.0, size=y.shape[0])
+        reg.fit(
+            X, y, sample_weight=w, eval_set=[(X, y)], sample_weight_eval_set=[w]
+        )
+        results_custom = reg.evals_result()
+        tm.non_increasing(results_custom["validation_0"]["rmse"])
+
 
     def test_no_duplicated_partition(self) -> None:
-        """Assert each worker has the correct amount of data, and DMatrix initialization doesn't
-        generate unnecessary copies of data.
+        """Assert each worker has the correct amount of data, and DMatrix initialization
+        doesn't generate unnecessary copies of data.
 
         """
         with LocalCluster(n_workers=2, dashboard_address=":0") as cluster:
diff --git a/tests/test_distributed/test_with_spark/test_spark_local.py b/tests/test_distributed/test_with_spark/test_spark_local.py
index b8c16ef1c..ab983c920 100644
--- a/tests/test_distributed/test_with_spark/test_spark_local.py
+++ b/tests/test_distributed/test_with_spark/test_spark_local.py
@@ -929,8 +929,127 @@ class TestPySparkLocal:
         model_loaded.set_device("cuda")
         assert model_loaded._run_on_gpu()
 
+    def test_validate_gpu_params(self) -> None:
+        # Standalone
+        standalone_conf = (
+            SparkConf()
+            .setMaster("spark://foo")
+            .set("spark.executor.cores", "12")
+            .set("spark.task.cpus", "1")
+            .set("spark.executor.resource.gpu.amount", "1")
+            .set("spark.task.resource.gpu.amount", "0.08")
+        )
+        classifer_on_cpu = SparkXGBClassifier(use_gpu=False)
+        classifer_on_gpu = SparkXGBClassifier(use_gpu=True)
+
+        # No exception for classifier on CPU
+        classifer_on_cpu._validate_gpu_params("3.4.0", standalone_conf)
+
+        with pytest.raises(
+            ValueError, match="XGBoost doesn't support GPU fractional configurations"
+        ):
+            classifer_on_gpu._validate_gpu_params("3.3.0", standalone_conf)
+
+        # No issues
+        classifer_on_gpu._validate_gpu_params("3.4.0", standalone_conf)
+        classifer_on_gpu._validate_gpu_params("3.4.1", standalone_conf)
+        classifer_on_gpu._validate_gpu_params("3.5.0", standalone_conf)
+        classifer_on_gpu._validate_gpu_params("3.5.1", standalone_conf)
+
+        # no spark.executor.resource.gpu.amount
+        standalone_bad_conf = (
+            SparkConf()
+            .setMaster("spark://foo")
+            .set("spark.executor.cores", "12")
+            .set("spark.task.cpus", "1")
+            .set("spark.task.resource.gpu.amount", "0.08")
+        )
+        msg_match = (
+            "The `spark.executor.resource.gpu.amount` is required for training on GPU"
+        )
+        with pytest.raises(ValueError, match=msg_match):
+            classifer_on_gpu._validate_gpu_params("3.3.0", standalone_bad_conf)
+        with pytest.raises(ValueError, match=msg_match):
+            classifer_on_gpu._validate_gpu_params("3.4.0", standalone_bad_conf)
+        with pytest.raises(ValueError, match=msg_match):
+            classifer_on_gpu._validate_gpu_params("3.4.1", standalone_bad_conf)
+        with pytest.raises(ValueError, match=msg_match):
+            classifer_on_gpu._validate_gpu_params("3.5.0", standalone_bad_conf)
+        with pytest.raises(ValueError, match=msg_match):
+            classifer_on_gpu._validate_gpu_params("3.5.1", standalone_bad_conf)
+
+        standalone_bad_conf = (
+            SparkConf()
+            .setMaster("spark://foo")
+            .set("spark.executor.cores", "12")
+            .set("spark.task.cpus", "1")
+            .set("spark.executor.resource.gpu.amount", "1")
+        )
+        msg_match = (
+            "The `spark.task.resource.gpu.amount` is required for training on GPU"
+        )
+        with pytest.raises(ValueError, match=msg_match):
+            classifer_on_gpu._validate_gpu_params("3.3.0", standalone_bad_conf)
+
+        classifer_on_gpu._validate_gpu_params("3.4.0", standalone_bad_conf)
+        classifer_on_gpu._validate_gpu_params("3.5.0", standalone_bad_conf)
+        classifer_on_gpu._validate_gpu_params("3.5.1", standalone_bad_conf)
+
+        # Yarn and K8s mode
+        for mode in ["yarn", "k8s://"]:
+            conf = (
+                SparkConf()
+                .setMaster(mode)
+                .set("spark.executor.cores", "12")
+                .set("spark.task.cpus", "1")
+                .set("spark.executor.resource.gpu.amount", "1")
+                .set("spark.task.resource.gpu.amount", "0.08")
+            )
+            with pytest.raises(
+                ValueError,
+                match="XGBoost doesn't support GPU fractional configurations",
+            ):
+                classifer_on_gpu._validate_gpu_params("3.3.0", conf)
+            with pytest.raises(
+                ValueError,
+                match="XGBoost doesn't support GPU fractional configurations",
+            ):
+                classifer_on_gpu._validate_gpu_params("3.4.0", conf)
+            with pytest.raises(
+                ValueError,
+                match="XGBoost doesn't support GPU fractional configurations",
+            ):
+                classifer_on_gpu._validate_gpu_params("3.4.1", conf)
+            with pytest.raises(
+                ValueError,
+                match="XGBoost doesn't support GPU fractional configurations",
+            ):
+                classifer_on_gpu._validate_gpu_params("3.5.0", conf)
+
+            classifer_on_gpu._validate_gpu_params("3.5.1", conf)
+
+        for mode in ["yarn", "k8s://"]:
+            bad_conf = (
+                SparkConf()
+                .setMaster(mode)
+                .set("spark.executor.cores", "12")
+                .set("spark.task.cpus", "1")
+                .set("spark.executor.resource.gpu.amount", "1")
+            )
+            msg_match = (
+                "The `spark.task.resource.gpu.amount` is required for training on GPU"
+            )
+            with pytest.raises(ValueError, match=msg_match):
+                classifer_on_gpu._validate_gpu_params("3.3.0", bad_conf)
+            with pytest.raises(ValueError, match=msg_match):
+                classifer_on_gpu._validate_gpu_params("3.4.0", bad_conf)
+            with pytest.raises(ValueError, match=msg_match):
+                classifer_on_gpu._validate_gpu_params("3.5.0", bad_conf)
+
+            classifer_on_gpu._validate_gpu_params("3.5.1", bad_conf)
+
     def test_skip_stage_level_scheduling(self) -> None:
-        conf = (
+        standalone_conf = (
             SparkConf()
             .setMaster("spark://foo")
             .set("spark.executor.cores", "12")
@@ -943,26 +1062,36 @@ class TestPySparkLocal:
         classifer_on_gpu = SparkXGBClassifier(use_gpu=True)
 
         # the correct configurations should not skip stage-level scheduling
-        assert not classifer_on_gpu._skip_stage_level_scheduling("3.4.0", conf)
+        assert not classifer_on_gpu._skip_stage_level_scheduling(
+            "3.4.0", standalone_conf
+        )
+        assert not classifer_on_gpu._skip_stage_level_scheduling(
+            "3.4.1", standalone_conf
+        )
+        assert not classifer_on_gpu._skip_stage_level_scheduling(
+            "3.5.0", standalone_conf
+        )
+        assert not classifer_on_gpu._skip_stage_level_scheduling(
+            "3.5.1", standalone_conf
+        )
 
         # spark version < 3.4.0
-        assert classifer_on_gpu._skip_stage_level_scheduling("3.3.0", conf)
-
+        assert classifer_on_gpu._skip_stage_level_scheduling("3.3.0", standalone_conf)
         # not run on GPU
-        assert classifer_on_cpu._skip_stage_level_scheduling("3.4.0", conf)
+        assert classifer_on_cpu._skip_stage_level_scheduling("3.4.0", standalone_conf)
 
         # spark.executor.cores is not set
-        badConf = (
+        bad_conf = (
             SparkConf()
             .setMaster("spark://foo")
             .set("spark.task.cpus", "1")
             .set("spark.executor.resource.gpu.amount", "1")
             .set("spark.task.resource.gpu.amount", "0.08")
         )
-        assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", badConf)
+        assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", bad_conf)
 
         # spark.executor.cores=1
-        badConf = (
+        bad_conf = (
             SparkConf()
             .setMaster("spark://foo")
             .set("spark.executor.cores", "1")
@@ -970,20 +1099,20 @@ class TestPySparkLocal:
             .set("spark.task.cpus", "1")
             .set("spark.executor.resource.gpu.amount", "1")
             .set("spark.task.resource.gpu.amount", "0.08")
         )
-        assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", badConf)
+        assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", bad_conf)
 
         # spark.executor.resource.gpu.amount is not set
-        badConf = (
+        bad_conf = (
             SparkConf()
             .setMaster("spark://foo")
             .set("spark.executor.cores", "12")
             .set("spark.task.cpus", "1")
             .set("spark.task.resource.gpu.amount", "0.08")
         )
-        assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", badConf)
+        assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", bad_conf)
 
         # spark.executor.resource.gpu.amount>1
-        badConf = (
+        bad_conf = (
             SparkConf()
             .setMaster("spark://foo")
             .set("spark.executor.cores", "12")
@@ -991,20 +1120,20 @@ class TestPySparkLocal:
             .set("spark.executor.resource.gpu.amount", "2")
             .set("spark.task.resource.gpu.amount", "0.08")
         )
-        assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", badConf)
+        assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", bad_conf)
 
         # spark.task.resource.gpu.amount is not set
-        badConf = (
+        bad_conf = (
             SparkConf()
             .setMaster("spark://foo")
             .set("spark.executor.cores", "12")
             .set("spark.task.cpus", "1")
             .set("spark.executor.resource.gpu.amount", "1")
         )
-        assert not classifer_on_gpu._skip_stage_level_scheduling("3.4.0", badConf)
+        assert not classifer_on_gpu._skip_stage_level_scheduling("3.4.0", bad_conf)
 
         # spark.task.resource.gpu.amount=1
-        badConf = (
+        bad_conf = (
             SparkConf()
             .setMaster("spark://foo")
             .set("spark.executor.cores", "12")
@@ -1012,29 +1141,32 @@ class TestPySparkLocal:
             .set("spark.executor.resource.gpu.amount", "1")
             .set("spark.task.resource.gpu.amount", "1")
         )
-        assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", badConf)
+        assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", bad_conf)
 
-        # yarn
-        badConf = (
-            SparkConf()
-            .setMaster("yarn")
-            .set("spark.executor.cores", "12")
-            .set("spark.task.cpus", "1")
-            .set("spark.executor.resource.gpu.amount", "1")
-            .set("spark.task.resource.gpu.amount", "1")
-        )
-        assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", badConf)
+        # For Yarn and K8S
+        for mode in ["yarn", "k8s://"]:
+            for gpu_amount in ["0.08", "0.2", "1.0"]:
+                conf = (
+                    SparkConf()
+                    .setMaster(mode)
+                    .set("spark.executor.cores", "12")
+                    .set("spark.task.cpus", "1")
+                    .set("spark.executor.resource.gpu.amount", "1")
+                    .set("spark.task.resource.gpu.amount", gpu_amount)
+                )
+                assert classifer_on_gpu._skip_stage_level_scheduling("3.3.0", conf)
+                assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", conf)
+                assert classifer_on_gpu._skip_stage_level_scheduling("3.4.1", conf)
+                assert classifer_on_gpu._skip_stage_level_scheduling("3.5.0", conf)
 
-        # k8s
-        badConf = (
-            SparkConf()
-            .setMaster("k8s://")
-            .set("spark.executor.cores", "12")
-            .set("spark.task.cpus", "1")
-            .set("spark.executor.resource.gpu.amount", "1")
-            .set("spark.task.resource.gpu.amount", "1")
-        )
-        assert classifer_on_gpu._skip_stage_level_scheduling("3.4.0", badConf)
+                # This will be fixed when spark 4.0.0 is released.
+                if gpu_amount == "1.0":
+                    assert classifer_on_gpu._skip_stage_level_scheduling("3.5.1", conf)
+                else:
+                    # Starting from 3.5.1+, stage-level scheduling is working for Yarn and K8s
+                    assert not classifer_on_gpu._skip_stage_level_scheduling(
+                        "3.5.1", conf
+                    )
 
 
 class XgboostLocalTest(SparkTestCase):